In [1]:
import numpy as np 
import pandas as pd
from glob import glob
import torch
from torch import optim
import torchvision
import timm
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
from PIL import Image
import random
import os
from torchvision.transforms import v2
from torch.utils.data import Dataset , DataLoader
import cv2
import matplotlib.pyplot as plt
import albumentations as A
from albumentations import (
    Compose, OneOf, Normalize, Resize, RandomResizedCrop, RandomCrop, HorizontalFlip, VerticalFlip, 
      RandomBrightnessContrast, Rotate, ShiftScaleRotate,  Transpose
    )
from albumentations.pytorch import ToTensorV2
from sklearn.model_selection import KFold
import torch.nn as nn
from contextlib import contextmanager
from torch.optim import Adam, SGD
from functools import partial
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau
import time
from sklearn.metrics import roc_auc_score
import math
In [2]:
from catalyst.data import BalanceClassSampler
In [3]:
# Legacy switch: when True, a later cell re-parses the label .txt files line by line.
txt_to_csv = False
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Kaggle input layout: the label files are read from DIR_PATH, the images from the two sub-folders.
DIR_PATH = "/kaggle/input/deepfake/phase1"
TRAIN_DIR = "/kaggle/input/deepfake/phase1/trainset"
TEST_DIR = "/kaggle/input/deepfake/phase1/valset"
# Destination of the training log, the checkpoints and the submission file.
OUTPUT_DIR = "/kaggle/working/"
class CFG:
    """Central place for every knob of the run (plain class used as a namespace)."""
    seed = 42
    n_fold = 5
    target_col = 'target'
    train=True
    inference=False
    pseudo_labeling = True
    num_classes = 2 #binary class
    trn_fold=[1]  # folds that are actually trained / used for inference
    debug=False
    apex=False
    print_freq=20 #every how many batch the scores get showed
    num_workers=4
#     model_name="eva02_large_patch14_448.mim_m38m_ft_in22k_in1k"
    model_name=  "efficientnet_b4"
    size=448
    scheduler='CosineAnnealingWarmRestarts' 
    epochs=2
    lr=1e-4
    min_lr=1e-6
    T_0=10 # CosineAnnealingWarmRestarts
    # The next four values are read by get_scheduler() for the alternative
    # schedulers; without them, switching `scheduler` raises AttributeError.
    T_max=10 # CosineAnnealingLR
    factor=0.2 # ReduceLROnPlateau
    patience=4 # ReduceLROnPlateau
    eps=1e-6 # ReduceLROnPlateau
    batch_size=16
    weight_decay=1e-6
    gradient_accumulation_steps=1
    max_grad_norm=1000
In [20]:
# The label files are comma-separated text with a header line, so read_csv can parse them as-is.
train = pd.read_csv(DIR_PATH + '/trainset_label.txt')
test = pd.read_csv(DIR_PATH + '/valset_label.txt')
In [21]:
if CFG.pseudo_labeling:
    # Pseudo-labelling: confident predictions of an earlier model are turned into
    # hard labels and appended to the training frame.
    ps = pd.read_csv('/kaggle/input/pl-b4-first-epoch/b4_nTTA.csv')
    ps = ps.rename(columns={"label": "target"})
    # Keep only confident rows. The explicit .copy() makes `to_add` an independent
    # frame, so the assignment below never writes into a slice of `ps`.
    confident = (ps['target'] > 0.9) | (ps['target'] < 0.1)
    to_add = ps[confident].copy()
    # Harden the probabilities: > 0.9 becomes 1, everything else (here: < 0.1) becomes 0.
    to_add["target"] = (to_add["target"] > 0.9).astype(int)
    print(to_add["target"].value_counts())
    shape_before = train.shape
    train = pd.concat([train, to_add], axis=0).reset_index(drop=True)
    shape_after = train.shape
    print(f"The shape of the train set have moved from {shape_before} => {shape_after}")
    # NOTE(review): this cell is not idempotent - running it twice appends the
    # pseudo-labelled rows twice. Re-run the label-loading cell first.
target
1    87148
0    57023
Name: count, dtype: int64
The shape of the train set have moved from (524429, 2) => (668600, 2)
In [22]:
from sklearn.metrics import log_loss

def get_score(y_true, y_pred):
    """Sum of the per-class binary log losses for a two-class problem.

    Args:
        y_true: 1-D iterable of labels; a value equal to 1 is the positive
            class, anything else counts as class 0.
        y_pred: array-like of shape (n_samples, 2) with one probability column
            per class.

    Returns:
        float: log loss of column 0 plus log loss of column 1 (a sum, not a mean).
    """
    num_classes = 2
    y_pred = np.asarray(y_pred)
    # One-hot encode the labels: column 0 flags class 0, column 1 flags class 1.
    one_hot = np.array([[0, 1] if label == 1 else [1, 0] for label in y_true])

    total_log_loss = 0.0
    for class_idx in range(num_classes):
        # labels=[0, 1] keeps log_loss usable when a batch of labels contains a
        # single class only (it raises otherwise).
        total_log_loss += log_loss(one_hot[:, class_idx], y_pred[:, class_idx], labels=[0, 1])
    return total_log_loss
#     mean_log_loss = total_log_loss / num_classes
#     return mean_log_loss

# def get_score(y_true, y_pred):
#     # Ensure y_true and y_pred are 1D arrays
#     y_true = y_true.flatten()
#     y_pred = y_pred.flatten()

#     # Calculate the log loss directly
#     total_log_loss = log_loss(y_true, y_pred)



@contextmanager
def timer(name):
    """Log the start of a named section and, once the body finishes, how long it took.

    Args:
        name: label shown in square brackets in both log lines.

    The closing line is only written when the body exits normally; there is no
    try/finally around the yield.
    """
    started_at = time.time()
    LOGGER.info(f'[{name}] start')
    yield
    elapsed = time.time() - started_at
    LOGGER.info(f'[{name}] done in {elapsed:.0f} s.')


def init_logger(log_file=OUTPUT_DIR+'train.log'):
    """Create the notebook logger that writes bare messages to the console and to a file.

    Args:
        log_file: path of the log file (default: train.log inside OUTPUT_DIR).

    Returns:
        logging.Logger at INFO level with exactly one stream handler and one
        file handler attached.
    """
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger hands back the same object for a given name, so re-running this
    # cell would otherwise stack handlers and print every message several times.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    for handler in (StreamHandler(), FileHandler(filename=log_file)):
        handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler)
    return logger

# Module-level logger shared by timer(), train_loop() and main().
LOGGER = init_logger()



def seed_torch(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) and ask cuDNN for deterministic kernels.

    Args:
        seed: integer used for every generator and for PYTHONHASHSEED.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Every seeding function takes the seed as its only argument.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True

# Seed every random generator once, before any sampling or model creation happens.
seed_torch(seed=CFG.seed)
In [23]:
# Debug mode: one epoch on a fixed random subsample (10000 train / 1000 test rows)
# so the whole pipeline can be exercised quickly.
if CFG.debug:
    CFG.epochs = 1
    train = train.sample(n=10000, random_state=CFG.seed).reset_index(drop=True)
    test = test.sample(n=1000, random_state=CFG.seed).reset_index(drop=True)
In [24]:
# Paths of everything inside the valset folder.
# NOTE(review): `files` does not appear to be used by the visible cells - confirm before keeping.
files = glob(DIR_PATH+"/valset/*")
In [25]:
def len_txt(txt_file_path):
    """Return the number of lines in a text file (a header line, if any, is counted too)."""
    with open(txt_file_path) as handle:
        return sum(1 for _ in handle)
In [26]:
# Raw line counts of the two label files (the header line is included in the count).
print(f"The train file contains {len_txt(DIR_PATH +'/trainset_label.txt')} elements")
print(f"The test file contains {len_txt(DIR_PATH +'/valset_label.txt')} elements")
The train file contains 524430 elements
The test file contains 147364 elements
In [27]:
# tkhalwidh
if txt_to_csv:

    def _read_label_rows(label_path, desc):
        """Parse one `img_name,target` label file into a list of row dicts (header line skipped)."""
        rows = []
        with open(label_path) as f:
            for counter, line in enumerate(tqdm(f, desc=desc)):
                if counter >= 1:  # line 0 is the header
                    l = line.strip().split(",")
                    rows.append({"img_name": l[0], "target": l[1]})
        return rows

    # Rows are collected in a list and appended with a single concat: growing a
    # DataFrame one `.loc` row at a time copies the frame on every insert.
    # NOTE(review): `train` / `test` were already filled by read_csv earlier, so
    # enabling this flag appends every row a second time, with the target kept
    # as a string - confirm this legacy path is still wanted.
    train_rows = _read_label_rows(DIR_PATH+"/trainset_label.txt", "Collecting train set")
    if train_rows:
        train = pd.concat([train, pd.DataFrame(train_rows)], ignore_index=True)

    test_rows = _read_label_rows(DIR_PATH+"/valset_label.txt", "Collecting test set")
    if test_rows:
        test = pd.concat([test, pd.DataFrame(test_rows)], ignore_index=True)
In [28]:
# Class balance of the training labels (pseudo-labelled rows included when that option is on).
sns.countplot(data = train , x = train["target"])
Out[28]:
<Axes: xlabel='target', ylabel='count'>
In [29]:
class TrainDataset(Dataset):
    """Labelled image dataset backed by a DataFrame with `img_name` and `target` columns.

    Images are looked up in TRAIN_DIR first and in TEST_DIR second, because
    pseudo-labelled rows point at files of the validation set.
    """

    def __init__(self, df, transform=None):
        """
        Args:
            df: DataFrame providing the `img_name` and `target` columns.
            transform: optional callable used as `transform(image=array)["image"]`
                (albumentations convention); None returns the raw RGB array.
        """
        self.df = df
        self.transform = transform
        self.file_names = df["img_name"].values
        self.labels = df["target"].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return `(image, label)` for row `idx`; the label is a long tensor.

        Raises:
            FileNotFoundError: the file is in neither image directory.
            ValueError: the file exists but OpenCV could not decode it.
        """
        file_name = self.file_names[idx]

        # Check if the file is in the TRAIN_DIR or TEST_DIR
        file_path_train = f'{TRAIN_DIR}/{file_name}'
        file_path_test = f'{TEST_DIR}/{file_name}'

        if os.path.exists(file_path_train):
            file_path = file_path_train
        elif os.path.exists(file_path_test):
            file_path = file_path_test
        else:
            raise FileNotFoundError(f'File {file_name} not found in either TRAIN_DIR or TEST_DIR')

        image = cv2.imread(file_path)
        if image is None:
            # cv2.imread reports an unreadable file by returning None instead of
            # raising; fail here with the path rather than inside cvtColor.
            raise ValueError(f'File {file_path} exists but could not be decoded as an image')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']

        label = torch.tensor(self.labels[idx]).long()
        return image, label

    def get_labels(self):
        """Return all labels as a list (consumed by the class-balancing sampler)."""
        return list(self.labels)
    
class TestDataset(Dataset):
    """Unlabelled image dataset: yields images from TEST_DIR listed in the `img_name` column."""

    def __init__(self, df, transform=None):
        """
        Args:
            df: DataFrame providing the `img_name` column.
            transform: optional callable used as `transform(image=array)["image"]`.
        """
        self.df = df
        self.transform = transform
        self.file_names = df["img_name"].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return the (optionally transformed) RGB image of row `idx`.

        Raises:
            FileNotFoundError: OpenCV could not read the file (missing or corrupt).
        """
        file_name = self.file_names[idx]
        file_path = f'{TEST_DIR}/{file_name}'
        image = cv2.imread(file_path)
        if image is None:
            # cv2.imread returns None for a missing or undecodable file.
            raise FileNotFoundError(f'File {file_path} is missing or could not be decoded as an image')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']

        return image
In [30]:
# Untransformed dataset, used only to preview raw images in the next cell.
train_dataset = TrainDataset(train)
In [31]:
# Preview the first eight raw samples on a 2 x 4 grid.
fig, axes = plt.subplots(2, 4, figsize=(10, 7))

# axes.flat walks the grid row by row, so every cell gets its own sample index
# (the former `i * 3 + j` showed sample 3 twice and never reached sample 7).
for index, ax in enumerate(axes.flat):
    if index < len(train_dataset):
        image, label = train_dataset[index]
        ax.imshow(image)
        if label.numpy() == 1:
            ax.set_title("Fake", color="r")
        else:
            ax.set_title("Real", color="g")
        ax.axis('off')

plt.tight_layout()
plt.show()
In [32]:
from albumentations import Compose, RandomBrightnessContrast, RandomCrop, \
    HorizontalFlip, FancyPCA, HueSaturationValue, OneOf, ToGray, ISONoise, MultiplicativeNoise, CoarseDropout, MedianBlur, Blur, GlassBlur, MotionBlur, \
    ShiftScaleRotate, ImageCompression, PadIfNeeded, GaussNoise, GaussianBlur, ToSepia, RandomShadow, RandomGamma, Rotate, Resize
from albumentations import RandomBrightnessContrast 
from PIL import Image
# from transforms.albu import IsotropicResize, FFT, SR, DCT, CustomRandomCrop
import cv2
import numpy as np
import os 
import imageio

import random

import cv2
import numpy as np
import torch
from albumentations import DualTransform, ImageOnlyTransform
from albumentations.augmentations.crops.transforms import Crop


from skimage.color import rgb2hsv, rgb2gray, rgb2yuv
from skimage import color, exposure, transform
from skimage.exposure import equalize_hist
from albumentations import RandomCrop
from scipy.fftpack import dct, idct

def isotropically_resize_image(img, size, interpolation_down=cv2.INTER_AREA, interpolation_up=cv2.INTER_CUBIC):
    """Resize `img` so that its longer side equals `size`, keeping the aspect ratio.

    Args:
        img: image array whose first two axes are (height, width).
        size: target length of the longer side, in pixels.
        interpolation_down: OpenCV flag used when the image shrinks.
        interpolation_up: OpenCV flag used when the image grows.

    Returns:
        `img` itself when the longer side already equals `size`; otherwise a
        resized uint8 copy (the shorter side is truncated to an int).
    """
    height, width = img.shape[:2]
    if max(width, height) == size:
        return img

    # Landscape images are scaled by their width; portrait and square ones by their height.
    if width > height:
        scale = size / width
        new_width, new_height = size, height * scale
    else:
        scale = size / height
        new_width, new_height = width * scale, size

    chosen_interpolation = interpolation_up if scale > 1 else interpolation_down
    return cv2.resize(img.astype('uint8'), (int(new_width), int(new_height)),
                      interpolation=chosen_interpolation)


class IsotropicResize(DualTransform):
    """Albumentations transform: resize so the longer side equals `max_side`, keeping the aspect ratio."""

    def __init__(self, max_side, interpolation_down=cv2.INTER_AREA, interpolation_up=cv2.INTER_CUBIC,
                 always_apply=False, p=1):
        """
        Args:
            max_side: target length of the longer image side.
            interpolation_down: OpenCV flag used when shrinking.
            interpolation_up: OpenCV flag used when enlarging.
            always_apply, p: forwarded to the albumentations base class.
        """
        super(IsotropicResize, self).__init__(always_apply, p)
        self.max_side = max_side
        self.interpolation_down = interpolation_down
        self.interpolation_up = interpolation_up

    def apply(self, img, interpolation_down=None, interpolation_up=None, **params):
        """Resize `img`; explicit interpolation arguments override the configured ones."""
        # Fall back to the flags given to __init__. With fixed defaults in the
        # signature the configured flags were never used, which made differently
        # configured instances behave identically. Compare against None, not
        # truthiness: cv2.INTER_NEAREST is 0.
        if interpolation_down is None:
            interpolation_down = self.interpolation_down
        if interpolation_up is None:
            interpolation_up = self.interpolation_up
        return isotropically_resize_image(img, size=self.max_side, interpolation_down=interpolation_down,
                                          interpolation_up=interpolation_up)

    def apply_to_mask(self, img, **params):
        """Resize a mask with nearest-neighbour interpolation in both directions."""
        return self.apply(img, interpolation_down=cv2.INTER_NEAREST, interpolation_up=cv2.INTER_NEAREST, **params)

    def get_transform_init_args_names(self):
        return ("max_side", "interpolation_down", "interpolation_up")


class Resize4xAndBack(ImageOnlyTransform):
    """Degrade an image by shrinking it by a factor of 2 or 4 and scaling it back to its original size."""

    def __init__(self, always_apply=False, p=0.5):
        super().__init__(always_apply, p)

    def apply(self, img, **params):
        """Return the down-then-up-sampled image; the output has the input's height and width."""
        height, width = img.shape[:2]
        factor = random.choice([2, 4])
        shrunk = cv2.resize(img, (width // factor, height // factor), interpolation=cv2.INTER_AREA)
        # The up-sampling filter is drawn at random as well.
        upscale_mode = random.choice([cv2.INTER_CUBIC, cv2.INTER_LINEAR, cv2.INTER_NEAREST])
        return cv2.resize(shrunk, (width, height), interpolation=upscale_mode)


class RandomSizedCropNonEmptyMaskIfExists(DualTransform):
    """Random crop of random size that contains a non-zero mask pixel whenever the mask has one."""

    def __init__(self, min_max_height, w2h_ratio=[0.7, 1.3], always_apply=False, p=0.5):
        """
        Args:
            min_max_height: (low, high) range of the crop height, as a fraction
                of the mask height.
            w2h_ratio: (low, high) range of the crop width / height ratio.
            always_apply, p: forwarded to the albumentations base class.
        """
        super(RandomSizedCropNonEmptyMaskIfExists, self).__init__(always_apply, p)

        self.min_max_height = min_max_height
        self.w2h_ratio = w2h_ratio

    def apply(self, img, x_min=0, x_max=0, y_min=0, y_max=0, **params):
        """Crop `img` to the window [y_min, y_max) x [x_min, x_max)."""
        # Plain array slicing (rows are y, columns are x); no external crop helper is required.
        return img[y_min:y_max, x_min:x_max]

    @property
    def targets_as_params(self):
        return ["mask"]

    def get_params_dependent_on_targets(self, params):
        """Draw the crop window from the mask passed in `params["mask"]`.

        Returns:
            dict with `x_min`, `x_max`, `y_min`, `y_max`, clamped to the mask size.
        """
        mask = params["mask"]
        mask_height, mask_width = mask.shape[:2]
        crop_height = int(mask_height * random.uniform(self.min_max_height[0], self.min_max_height[1]))
        w2h_ratio = random.uniform(*self.w2h_ratio)
        crop_width = min(int(crop_height * w2h_ratio), mask_width - 1)
        if mask.sum() == 0:
            # Empty mask: any window that fits. randint is inclusive on both
            # ends, so the upper bound is the last offset that still fits.
            x_min = random.randint(0, max(0, mask_width - crop_width))
            y_min = random.randint(0, max(0, mask_height - crop_height))
        else:
            # Pick a random non-zero pixel and place the window so that it covers it.
            mask = mask.sum(axis=-1) if mask.ndim == 3 else mask
            non_zero_yx = np.argwhere(mask)
            y, x = random.choice(non_zero_yx)
            x_min = x - random.randint(0, crop_width - 1)
            y_min = y - random.randint(0, crop_height - 1)
            x_min = np.clip(x_min, 0, mask_width - crop_width)
            y_min = np.clip(y_min, 0, mask_height - crop_height)

        # x spans the width and y spans the height.
        x_max = min(mask_width, x_min + crop_width)
        y_max = min(mask_height, y_min + crop_height)
        return {"x_min": x_min, "x_max": x_max, "y_min": y_min, "y_max": y_max}

    def get_transform_init_args_names(self):
        # Only names that exist as attributes on the instance.
        return "min_max_height", "w2h_ratio"

class CustomRandomCrop(DualTransform):
    """Square random crop of side `size`, falling back to an isotropic resize for images smaller than `size`."""

    def __init__(self, size, p=0.5) -> None:
        super().__init__(p=p)
        self.prob = p
        self.size = size

    def apply(self, img, copy=True, **params):
        """Return a `size` x `size` crop, or the isotropically resized image when either side is too short."""
        too_small = img.shape[0] < self.size or img.shape[1] < self.size
        if too_small:
            operation = IsotropicResize(max_side=self.size, interpolation_down=cv2.INTER_LINEAR, interpolation_up=cv2.INTER_LINEAR)
        else:
            operation = RandomCrop(self.size, self.size)
        result = operation(image=img)
        return np.asarray(result["image"])

class FFT(DualTransform):
    """Frequency-domain augmentation built on the log-magnitude of the image's 2-D FFT.

    mode == 0: the input image is AND-ed with itself under the spectrum mask.
    any other mode: the spectrum mask itself is returned, repeated on three channels.
    """
    def __init__(self, mode, p=0.5) -> None:
        super(FFT, self).__init__(p=p)
        self.prob = p
        self.mode = mode

    def apply(self, img, copy=True, **params):
        # Spectrum of the grayscale image; fftshift moves the zero frequency to the centre.
        dark_image_grey_fourier = np.fft.fftshift(np.fft.fft2(rgb2gray(img)))
        # Log-magnitude truncated to uint8.
        # NOTE(review): magnitudes below 1 give a negative log (and 0 gives -inf);
        # casting those to uint8 is not well defined - confirm this is acceptable.
        mask = np.log(abs(dark_image_grey_fourier)).astype(np.uint8)
        mask = cv2.resize(mask, (img.shape[1], img.shape[0]))
        if self.mode == 0:
            # cv2.bitwise_and with a mask keeps the pixels where the mask is non-zero.
            return np.asarray(cv2.bitwise_and(img, img, mask=mask))
        else:
            mask = np.asarray(mask)
            image =  cv2.merge((mask, mask, mask))
            return image

class SR(DualTransform):
    """Halve the image resolution and pass the result through the callable `model_sr`.

    NOTE(review): presumably `model_sr` is a super-resolution network that
    restores the original size - the visible code does not show it; confirm.
    """
    def __init__(self, model_sr, p=0.5) -> None:
        super(SR, self).__init__(p=p)
        self.prob = p
        self.model_sr = model_sr

    def apply(self, img, copy=True, **params):
        # Down-sample to half the width and height.
        img = cv2.resize(img, (int(img.shape[1]/2), int(img.shape[0]/2)), interpolation = cv2.INTER_AREA)
        # HWC -> CHW, then add a batch axis.
        img = np.transpose(img, (2, 0, 1))
        # NOTE(review): `.to(2)` targets device index 2, which is hard-coded here -
        # confirm this matches the device `model_sr` lives on.
        img = torch.tensor(img, dtype=torch.float).unsqueeze(0).to(2)
        sr_img = self.model_sr(img)
        # Back to an HWC NumPy array on the CPU.
        return sr_img.squeeze(0).permute(1, 2, 0).detach().cpu().numpy()


class DCT(DualTransform):
    """Augmentation based on the discrete cosine transform of the grayscale image.

    mode == 0: the input image is AND-ed with itself under a log-DCT mask.
    any other mode: the float32 DCT coefficients are returned, repeated on three channels.
    """
    def __init__(self, mode, p=0.5) -> None:
        super(DCT, self).__init__(p=p)
        self.prob = p
        self.mode = mode

    def rgb2gray(self, rgb):
        # NOTE(review): the conversion code is BGR2GRAY although the datasets in this
        # file hand over RGB images - the channel weights are then swapped; confirm.
        return cv2.cvtColor(rgb, cv2.COLOR_BGR2GRAY)

    def apply(self, img, copy=True, **params):
        gray_img = self.rgb2gray(img)
        # Both calls use DCT_ROWS, so the row-wise transform is applied twice.
        # NOTE(review): a separable 2-D DCT would transpose between the two passes - confirm intent.
        dct_coefficients = cv2.dct(cv2.dct(np.float32(gray_img), flags=cv2.DCT_ROWS), flags=cv2.DCT_ROWS)
        # epsilon keeps the logarithm finite (and non-negative) for zero coefficients.
        epsilon = 1
        mask = np.log(np.abs(dct_coefficients) + epsilon).astype(np.uint8)
        mask = cv2.resize(mask, (img.shape[1], img.shape[0]))


        if self.mode == 0:
            return cv2.bitwise_and(img, img, mask=mask)
        else:
            # The result here is the raw float32 coefficient map, not a uint8 image.
            dct_coefficients = np.asarray(dct_coefficients)
            image = cv2.merge((dct_coefficients, dct_coefficients, dct_coefficients))
            return image
In [33]:
import albumentations as A
def get_transforms(*, data):
    """Build the albumentations pipeline for one data split.

    Args:
        data: 'train' for the augmenting pipeline, 'valid' for the deterministic
            resize + normalise pipeline (also used for inference).

    Returns:
        albumentations Compose that ends with Normalize (ImageNet mean / std)
        and ToTensorV2.

    Raises:
        ValueError: `data` is neither 'train' nor 'valid'. Falling through used
            to return None, which silently disabled every transform.
    """
    size = CFG.size

    if data == 'train':
        return Compose([
            ImageCompression(quality_lower=40, quality_upper=100, p=0.1),
            HorizontalFlip(),
            GaussNoise(p=0.3),
            ISONoise(p=0.3),
            MultiplicativeNoise(p=0.3),
            # Exactly one of the resize / crop variants is applied to every sample.
            OneOf([
                IsotropicResize(max_side=size, interpolation_down=cv2.INTER_AREA, interpolation_up=cv2.INTER_CUBIC),
                IsotropicResize(max_side=size, interpolation_down=cv2.INTER_AREA, interpolation_up=cv2.INTER_LINEAR),
                IsotropicResize(max_side=size, interpolation_down=cv2.INTER_LINEAR, interpolation_up=cv2.INTER_LINEAR),
                CustomRandomCrop(size=size)
            ], p=1),
            # Force the final square shape whatever the previous step produced.
            Resize(height=size, width=size),
            PadIfNeeded(min_height=size, min_width=size, border_mode=cv2.BORDER_CONSTANT , value=0 , p=1),
            OneOf([RandomBrightnessContrast(), FancyPCA(), HueSaturationValue()], p=0.5),
            OneOf([CoarseDropout()], p=0.05),
            ToGray(p=0.1),
            ToSepia(p=0.05),
            RandomShadow(p=0.05),
            RandomGamma(p=0.1),
            ShiftScaleRotate(shift_limit=0.1, scale_limit=0.2, rotate_limit=10, border_mode=cv2.BORDER_CONSTANT, p=0.5),
            FFT(mode=0, p=0.05),
            # NOTE(review): with mode=1 and p=0.5, half of the training samples are
            # replaced by their DCT coefficient map - confirm this is intended.
            DCT(mode=1, p=0.5) ,
            Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
            ToTensorV2(),
        ])

    elif data == 'valid':
        return Compose([
            IsotropicResize(max_side=size, interpolation_down=cv2.INTER_AREA, interpolation_up=cv2.INTER_CUBIC),
            Resize(size, size),
            PadIfNeeded(min_height=size, min_width=size, border_mode=cv2.BORDER_CONSTANT , value=0 ),
            Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
            ToTensorV2(),
        ])

    raise ValueError(f"get_transforms: unknown data split {data!r}; expected 'train' or 'valid'")
In [34]:
# Same data with the training augmentations applied, used for the preview and the model smoke test below.
train_dataset = TrainDataset(train , transform= get_transforms(data = "train"))
In [35]:
# train_dataset.get_labels()
In [36]:
# Preview the first eight augmented samples on a 2 x 4 grid.
fig, axes = plt.subplots(2, 4, figsize=(10, 7))

# axes.flat walks the grid row by row, so every cell gets its own sample index
# (the former `i * 3 + j` showed sample 3 twice and never reached sample 7).
for index, ax in enumerate(axes.flat):
    if index < len(train_dataset):
        image, label = train_dataset[index]
        # The dataset yields CHW tensors; imshow expects HWC.
        ax.imshow(image.permute(1,2,0))
        if label.numpy() == 1:
            ax.set_title("Fake", color="r")
        else:
            ax.set_title("Real", color="g")
        ax.axis('off')

plt.tight_layout()
plt.show()
In [37]:
# Give every training row a fold id in [0, CFG.n_fold).
folds = train.copy()
# Plain shuffled KFold: the target passed to split() below does not influence the split.
Fold = KFold(n_splits = CFG.n_fold  , shuffle = True , random_state = CFG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds[CFG.target_col])):
    # Rows that form the validation part of split n get fold id n.
    folds.loc[val_index, 'fold'] = int(n)
# The column is created by partial assignment, so cast it to int once it is fully filled.
folds['fold'] = folds['fold'].astype(int)
In [38]:
class CustomResNext(nn.Module):
    """Thin wrapper around a timm classifier.

    The backbone is always built from `CFG.model_name`; the `model` argument is
    accepted but not used by the constructor.
    """

    def __init__(self, model='resnext50_32x4d', pretrained=False, num_classes=2):
        """
        Args:
            model: unused, kept for call compatibility.
            pretrained: forwarded to timm.create_model.
            num_classes: size of the classification head.
        """
        super(CustomResNext, self).__init__()
        backbone_options = dict(
            pretrained=pretrained,
            drop_rate=0.1,
            drop_path_rate=0.2,
            num_classes=num_classes,
        )
        self.model = timm.create_model(CFG.model_name, **backbone_options)

    def forward(self, x):
        """Return the backbone output for the batch `x`."""
        logits = self.model(x)
        return logits
In [39]:
# Smoke test: push one augmented sample through an untrained model to check the output shape.
model = CustomResNext(pretrained=False)
# unsqueeze(0) adds the batch axis in front of the channel axis.
model(train_dataset[0][0].unsqueeze(0))
Out[39]:
tensor([[ 1.4166, -6.5389]], grad_fn=<AddmmBackward0>)
In [40]:
import wandb

# Log in to Weights & Biases with the key stored under the Kaggle secret "wandb_api".
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("wandb_api")
    wandb.login(key=api_key)
    anonymous = None
except Exception as exc:
    # `except Exception` lets KeyboardInterrupt / SystemExit propagate and keeps
    # the actual reason of the failure visible.
    anonymous = "must"
    print(f'W&B login failed: {exc!r}')
    print('To use your W&B account,\nGo to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')
wandb: W&B API key is configured. Use `wandb login --relogin` to force relogin
wandb: WARNING If you're specifying your api key in code, ensure this code is not shared publicly.
wandb: WARNING Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.
wandb: Appending key for api.wandb.ai to your netrc file: /root/.netrc
In [41]:
# Start the W&B run that train_fn / valid_fn log into.
# NOTE(review): `anonymous` set by the login cell is not passed to wandb.init,
# so the anonymous fallback has no effect - confirm.
run = wandb.init(entity = 'lassouedaymenla',
                 project = 'tutorial',
                 save_code = True,
                 name = "efficientnet_b4_epcoh2"
)
wandb: Currently logged in as: lassouedaymenla. Use `wandb login --relogin` to force relogin
wandb version 0.17.5 is available! To upgrade, please run: $ pip install wandb --upgrade
Tracking run with wandb version 0.17.4
Run data is saved locally in /kaggle/working/wandb/run-20240722_133256-9hfw3672
Syncing run efficientnet_b4_epcoh2 to Weights & Biases (docs)
View project at https://wandb.ai/lassouedaymenla/tutorial
View run at https://wandb.ai/lassouedaymenla/tutorial/runs/9hfw3672
In [42]:
class AverageMeter:
    """Running tracker: keeps the latest value, the weighted sum, the weight total and their average."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` with weight `n` (typically the batch size) and refresh the average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration in seconds as '<minutes>m <seconds>s' (both parts truncated to integers)."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Describe elapsed and estimated remaining time.

    Args:
        since: start timestamp as returned by time.time().
        percent: completed fraction of the work, in (0, 1].

    Returns:
        str such as '1m 2s (remain 3m 4s)'; the remaining time is extrapolated
        linearly from the elapsed time.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / percent
    remaining = estimated_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))


def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """Run one training epoch.

    Args:
        train_loader: iterable of (images, labels) batches.
        model: network to optimise; expected to already live on `device`.
        criterion: loss called as criterion(logits, labels).
        optimizer: optimizer updating `model`'s parameters.
        epoch: zero-based epoch index, used for printing only.
        scheduler: accepted for signature compatibility; not stepped here.
        device: device the batches are moved to.

    Returns:
        float: sample-weighted average training loss over the epoch.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # switch to train mode
    model.train()
    start = end = time.time()
    global_step = 0
    for step, (images, labels) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        # Inputs and labels are on `device` already, so the outputs are too; no
        # hard-coded .cuda() call is needed (it broke CPU-only runs).
        y_preds = model(images)
        loss = criterion(y_preds, labels)
        # record loss
        losses.update(loss.item(), batch_size)

        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        if CFG.apex:
            # NOTE(review): `amp` is not imported anywhere in the visible cells;
            # this branch needs NVIDIA apex's `amp` in scope to work.
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  .format(
                   epoch+1, step, len(train_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses,
                   remain=timeSince(start, float(step+1)/len(train_loader)),
                   grad_norm=grad_norm,
                   ))

        # Per-step metrics for the W&B dashboard.
        wandb.log({
            "Train Loss": losses.val,
            "Step": step,
            "Gradient Norm": grad_norm,
            "Learning Rate": optimizer.param_groups[0]['lr'],
        })
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    """Evaluate the model on the validation loader.

    Args:
        valid_loader: iterable of (images, labels) batches.
        model: network to evaluate; expected to already live on `device`.
        criterion: loss called as criterion(logits, labels).
        device: device the batches are moved to.

    Returns:
        tuple: (sample-weighted average loss, NumPy array with the softmax
        probabilities of every batch stacked along axis 0).
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # switch to evaluation mode
    model.eval()
    preds = []
    start = end = time.time()
    for step, (images, labels) in enumerate(valid_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        # compute loss
        with torch.no_grad():
            # Outputs are on `device` already; no hard-coded .cuda() call is needed.
            y_preds = model(images)

        loss = criterion(y_preds, labels)
        losses.update(loss.item(), batch_size)

        # Store probabilities, not logits.
        y_preds = torch.nn.functional.softmax(y_preds, dim=1)
        preds.append(y_preds.to('cpu').numpy())

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(
                   step, len(valid_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses,
                   remain=timeSince(start, float(step+1)/len(valid_loader)),
                   ))
        wandb.log({
            "Val Loss ": losses.val,
            "Val Step": step,
        })
    predictions = np.concatenate(preds)
    return losses.avg, predictions



def inference(model, states, test_loader, device):
    """Predict the test loader with every checkpoint in `states` and average the outputs.

    Args:
        model: network whose weights are replaced by each checkpoint in turn.
        states: list of checkpoint dicts, each holding the weights under 'model'.
        test_loader: iterable of image batches (no labels).
        device: device used for the forward passes.

    Returns:
        NumPy array of the raw model outputs (no softmax is applied here),
        averaged over the checkpoints and concatenated over the batches.
    """
    model.to(device)
    batch_outputs = []
    progress = tqdm(enumerate(test_loader), total=len(test_loader))
    for _, images in progress:
        images = images.to(device)
        per_state_outputs = []
        for state in states:
            # The single model instance is re-loaded with each checkpoint for every batch.
            model.load_state_dict(state['model'])
            model.eval()
            with torch.no_grad():
                logits = model(images)
            per_state_outputs.append(logits.to('cpu').numpy())
        batch_outputs.append(np.mean(per_state_outputs, axis=0))
    return np.concatenate(batch_outputs)
In [43]:
def train_loop(folds, fold):
    """Fine-tune the model on every fold except `fold`, validate on `fold`, keep the best checkpoint.

    Args:
        folds: DataFrame with the image names, the target column and a 'fold' column.
        fold: id of the fold held out for validation.

    Side effects:
        Writes `{CFG.model_name}_fold{fold}_best.pth` to OUTPUT_DIR whenever the
        validation log loss improves; the file holds the model weights and the
        positive-class validation probabilities.
    """
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    trn_idx = folds[folds['fold'] != fold].index
    val_idx = folds[folds['fold'] == fold].index

    train_folds = folds.loc[trn_idx].reset_index(drop=True)
    valid_folds = folds.loc[val_idx].reset_index(drop=True)

    train_dataset = TrainDataset(train_folds,
                                 transform=get_transforms(data='train'))
    valid_dataset = TrainDataset(valid_folds,
                                 transform=get_transforms(data='valid'))

    # The sampler rebalances the classes by up-sampling; shuffle must stay False
    # because DataLoader does not accept shuffle together with a sampler.
    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers,
                              sampler=BalanceClassSampler(labels=train_dataset.get_labels(), mode="upsampling"),
                              pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(optimizer):
        """Build the scheduler named by CFG.scheduler; raise on an unknown name."""
        if CFG.scheduler=='ReduceLROnPlateau':
            scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=CFG.factor, patience=CFG.patience, verbose=True, eps=CFG.eps)
        elif CFG.scheduler=='CosineAnnealingLR':
            scheduler = CosineAnnealingLR(optimizer, T_max=CFG.T_max, eta_min=CFG.min_lr, last_epoch=-1)
        elif CFG.scheduler=='CosineAnnealingWarmRestarts':
            scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=CFG.T_0, T_mult=1, eta_min=CFG.min_lr, last_epoch=-1)
        else:
            raise ValueError(f"Unknown CFG.scheduler: {CFG.scheduler!r}")
        return scheduler

    # ====================================================
    # model & optimizer
    # ====================================================
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
    # NOTE(review): the same fold-0 checkpoint is loaded whatever `fold` is - confirm.
    checkpoint = torch.load('/kaggle/input/b4ntta1epoch/pytorch/default/1/efficientnet_b4_fold0_best.pth',
                            map_location=device)
    # The checkpoint overwrites every weight right below, so downloading the
    # pretrained weights first would be wasted work.
    model = CustomResNext(CFG.model_name, pretrained=False)
    model.load_state_dict(checkpoint['model'])
    model.to(device)

    optimizer = Adam(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay, amsgrad=False)
    scheduler = get_scheduler(optimizer)

    # ====================================================
    # apex
    # ====================================================
    if CFG.apex:
        # NOTE(review): `amp` is not imported anywhere in the visible cells.
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.CrossEntropyLoss().to(device)

    best_score = 50000

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)
        valid_labels = valid_folds[CFG.target_col].values

        # The scheduler is stepped once per epoch; only ReduceLROnPlateau needs the metric.
        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        score = get_score(valid_labels, preds)
        print(score)
        # valid_fn already returns softmax probabilities; a second softmax would
        # squash them towards 0.5 before they are stored in the checkpoint.
        preds = preds[:, 1]
        score2 = roc_auc_score(valid_labels, preds)

        elapsed = time.time() - start_time

        # LOGGER.info output shows up in the red (stderr) box of the notebook.
        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - LogLoss: {score} - AUC: {score2}')

        if score < best_score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'preds': preds},
                        OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth')

    # Nothing is returned: the best checkpoint on disk is the result of this function.
    return
In [44]:
def main():
    """Run the training and/or inference stages selected in ``CFG``.

    Training (``CFG.train``): calls ``train_loop(folds, fold)`` for each fold
    index in ``range(CFG.n_fold)`` that is listed in ``CFG.trn_fold``.
    Out-of-fold (OOF) aggregation is currently disabled; the related lines
    are kept commented out below.

    Inference (``CFG.inference``): loads the best checkpoint of every fold in
    ``CFG.trn_fold``, runs ``inference`` over ``test``, stores the softmax
    probability of class index 1 in ``test['label']`` and writes
    ``submission.csv`` into ``OUTPUT_DIR``.

    Returns:
        None. Results are reported through ``LOGGER``/stdout and files
        written under ``OUTPUT_DIR``.
    """

    def get_result(result_df):
        """Log the ``get_score`` value of ``result_df`` ('preds' vs. target column).

        Only referenced by the commented-out OOF code below.
        """
        preds = result_df['preds'].values
        labels = result_df[CFG.target_col].values
        score = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.5f}')

    if CFG.train:
        # train
        # Accumulator for OOF predictions; unused while the OOF lines below stay disabled.
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                train_loop(folds, fold)
                #oof_df = pd.concat([oof_df, _oof_df])
                #LOGGER.info(f"========== fold: {fold} result ==========")
                #get_result(_oof_df)
        # CV result
        LOGGER.info("========== CV ==========")
        #get_result(oof_df)
        # save result
        #oof_df.to_csv(OUTPUT_DIR+'oof_df.csv', index=False)

    if CFG.inference:
        # inference
        model = CustomResNext(CFG.model_name, pretrained=False)
        # map_location lets checkpoints saved from a CUDA run be loaded when
        # `device` has fallen back to 'cpu'; without it torch.load tries to
        # restore tensors onto the device they were saved from.
        # NOTE(review): newer PyTorch releases default to weights_only=True, which
        # may reject non-tensor checkpoint entries — confirm against the installed version.
        states = [
            torch.load(OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth', map_location=device)
            for fold in CFG.trn_fold
        ]
        test_dataset = TestDataset(test, transform=get_transforms(data='valid'))
        test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False,
                                 num_workers=CFG.num_workers, pin_memory=True)
        predictions = inference(model, states, test_loader, device)
        # submission
        print(predictions)
        # Softmax over dim 1 of the raw predictions; column 1 is kept as the submitted label score.
        test['label'] = torch.nn.functional.softmax(torch.from_numpy(predictions), dim=1).numpy()[:,1]
        print(test['label'])
        test[['img_name', 'label']].to_csv(OUTPUT_DIR+'submission.csv', index=False)
In [45]:
# Entry point: run the pipeline only when this module is executed directly.
if __name__ == "__main__":
    main()
========== fold: 1 training ==========
model.safetensors:   0%|          | 0.00/77.9M [00:00<?, ?B/s]
Epoch: [1][0/51233] Data 1.522 (1.522) Elapsed 0m 3s (remain 3167m 52s) Loss: 0.0062(0.0062) Grad: 0.0898  
Epoch: [1][20/51233] Data 0.311 (0.351) Elapsed 0m 14s (remain 571m 52s) Loss: 0.0155(0.0518) Grad: 0.3489  
Epoch: [1][40/51233] Data 0.317 (0.333) Elapsed 0m 24s (remain 514m 27s) Loss: 0.0424(0.0777) Grad: 0.8080  
Epoch: [1][60/51233] Data 0.305 (0.326) Elapsed 0m 35s (remain 494m 37s) Loss: 0.1890(0.0871) Grad: 1.1324  
Epoch: [1][80/51233] Data 0.316 (0.321) Elapsed 0m 46s (remain 484m 29s) Loss: 0.0046(0.0827) Grad: 0.0820  
Epoch: [1][100/51233] Data 0.314 (0.319) Elapsed 0m 56s (remain 478m 15s) Loss: 0.0032(0.0732) Grad: 0.0466  
Epoch: [1][120/51233] Data 0.315 (0.317) Elapsed 1m 7s (remain 474m 7s) Loss: 0.0056(0.0710) Grad: 0.1352  
Epoch: [1][140/51233] Data 0.318 (0.316) Elapsed 1m 18s (remain 471m 3s) Loss: 0.2276(0.0714) Grad: 3.3000  
Epoch: [1][160/51233] Data 0.318 (0.316) Elapsed 1m 28s (remain 468m 43s) Loss: 0.0168(0.0749) Grad: 0.7735  
Epoch: [1][180/51233] Data 0.316 (0.315) Elapsed 1m 39s (remain 466m 49s) Loss: 0.0292(0.0743) Grad: 0.3720  
Epoch: [1][200/51233] Data 0.311 (0.314) Elapsed 1m 49s (remain 465m 22s) Loss: 0.0933(0.0748) Grad: 1.1465  
Epoch: [1][220/51233] Data 0.317 (0.314) Elapsed 2m 0s (remain 464m 4s) Loss: 0.0083(0.0738) Grad: 0.2419  
Epoch: [1][240/51233] Data 0.307 (0.314) Elapsed 2m 11s (remain 463m 4s) Loss: 0.0162(0.0723) Grad: 0.3698  
Epoch: [1][260/51233] Data 0.308 (0.314) Elapsed 2m 21s (remain 462m 0s) Loss: 0.1068(0.0717) Grad: 1.1143  
Epoch: [1][280/51233] Data 0.308 (0.313) Elapsed 2m 32s (remain 461m 10s) Loss: 0.0491(0.0729) Grad: 1.4455  
Epoch: [1][300/51233] Data 0.309 (0.313) Elapsed 2m 43s (remain 460m 24s) Loss: 0.0070(0.0719) Grad: 0.1638  
Epoch: [1][320/51233] Data 0.313 (0.313) Elapsed 2m 53s (remain 459m 44s) Loss: 0.1902(0.0733) Grad: 2.6777  
Epoch: [1][340/51233] Data 0.307 (0.312) Elapsed 3m 4s (remain 459m 6s) Loss: 0.1224(0.0733) Grad: 1.2253  
Epoch: [1][360/51233] Data 0.318 (0.312) Elapsed 3m 15s (remain 458m 30s) Loss: 0.0987(0.0728) Grad: 1.2940  
Epoch: [1][380/51233] Data 0.310 (0.312) Elapsed 3m 25s (remain 457m 57s) Loss: 0.0235(0.0730) Grad: 0.7193  
Epoch: [1][400/51233] Data 0.316 (0.312) Elapsed 3m 36s (remain 457m 27s) Loss: 0.0465(0.0731) Grad: 0.8473  
Epoch: [1][420/51233] Data 0.318 (0.312) Elapsed 3m 47s (remain 457m 0s) Loss: 0.1262(0.0727) Grad: 1.5813  
Epoch: [1][440/51233] Data 0.308 (0.312) Elapsed 3m 57s (remain 456m 32s) Loss: 0.0099(0.0740) Grad: 0.1856  
Epoch: [1][460/51233] Data 0.305 (0.312) Elapsed 4m 8s (remain 456m 6s) Loss: 0.1919(0.0732) Grad: 1.2267  
Epoch: [1][480/51233] Data 0.317 (0.312) Elapsed 4m 19s (remain 455m 44s) Loss: 0.1983(0.0727) Grad: 2.0386  
Epoch: [1][500/51233] Data 0.310 (0.312) Elapsed 4m 29s (remain 455m 21s) Loss: 0.1229(0.0731) Grad: 1.6142  
Epoch: [1][520/51233] Data 0.317 (0.312) Elapsed 4m 40s (remain 454m 59s) Loss: 0.0231(0.0739) Grad: 0.2789  
Epoch: [1][540/51233] Data 0.309 (0.312) Elapsed 4m 51s (remain 454m 37s) Loss: 0.1984(0.0746) Grad: 1.0971  
Epoch: [1][560/51233] Data 0.308 (0.312) Elapsed 5m 1s (remain 454m 17s) Loss: 0.0017(0.0749) Grad: 0.0224  
Epoch: [1][580/51233] Data 0.316 (0.312) Elapsed 5m 12s (remain 453m 58s) Loss: 0.0366(0.0743) Grad: 0.4130  
Epoch: [1][600/51233] Data 0.290 (0.312) Elapsed 5m 23s (remain 453m 39s) Loss: 0.0029(0.0733) Grad: 0.0371  
Epoch: [1][620/51233] Data 0.316 (0.312) Elapsed 5m 33s (remain 453m 20s) Loss: 0.1063(0.0736) Grad: 0.9104  
Epoch: [1][640/51233] Data 0.318 (0.312) Elapsed 5m 44s (remain 453m 1s) Loss: 0.1326(0.0736) Grad: 1.6277  
Epoch: [1][660/51233] Data 0.301 (0.311) Elapsed 5m 55s (remain 452m 44s) Loss: 0.0131(0.0731) Grad: 0.2489  
Epoch: [1][680/51233] Data 0.316 (0.312) Elapsed 6m 5s (remain 452m 27s) Loss: 0.0061(0.0732) Grad: 0.1671  
Epoch: [1][700/51233] Data 0.311 (0.312) Elapsed 6m 16s (remain 452m 10s) Loss: 0.0090(0.0735) Grad: 0.2183  
Epoch: [1][720/51233] Data 0.311 (0.312) Elapsed 6m 27s (remain 451m 53s) Loss: 0.0367(0.0736) Grad: 0.5295  
Epoch: [1][740/51233] Data 0.310 (0.312) Elapsed 6m 37s (remain 451m 37s) Loss: 0.0895(0.0726) Grad: 1.3477  
Epoch: [1][760/51233] Data 0.311 (0.312) Elapsed 6m 48s (remain 451m 22s) Loss: 0.0095(0.0722) Grad: 0.2162  
Epoch: [1][780/51233] Data 0.314 (0.312) Elapsed 6m 58s (remain 451m 6s) Loss: 0.0208(0.0729) Grad: 0.5596  
Epoch: [1][800/51233] Data 0.308 (0.312) Elapsed 7m 9s (remain 450m 50s) Loss: 0.1351(0.0728) Grad: 1.9764  
Epoch: [1][820/51233] Data 0.308 (0.311) Elapsed 7m 20s (remain 450m 35s) Loss: 0.0081(0.0726) Grad: 0.1377  
Epoch: [1][840/51233] Data 0.318 (0.311) Elapsed 7m 30s (remain 450m 21s) Loss: 0.0335(0.0722) Grad: 0.9208  
Epoch: [1][860/51233] Data 0.309 (0.311) Elapsed 7m 41s (remain 450m 6s) Loss: 0.0063(0.0716) Grad: 0.1062  
Epoch: [1][880/51233] Data 0.308 (0.311) Elapsed 7m 52s (remain 449m 52s) Loss: 0.2228(0.0727) Grad: 1.3863  
Epoch: [1][900/51233] Data 0.318 (0.311) Elapsed 8m 2s (remain 449m 38s) Loss: 0.0192(0.0718) Grad: 0.3267  
Epoch: [1][920/51233] Data 0.316 (0.311) Elapsed 8m 13s (remain 449m 24s) Loss: 0.0473(0.0724) Grad: 0.7550  
Epoch: [1][940/51233] Data 0.309 (0.311) Elapsed 8m 24s (remain 449m 11s) Loss: 0.0045(0.0723) Grad: 0.0698  
Epoch: [1][960/51233] Data 0.289 (0.311) Elapsed 8m 34s (remain 448m 58s) Loss: 0.0222(0.0718) Grad: 0.4500  
Epoch: [1][980/51233] Data 0.308 (0.311) Elapsed 8m 45s (remain 448m 42s) Loss: 0.2311(0.0726) Grad: 1.5863  
Epoch: [1][1000/51233] Data 0.318 (0.311) Elapsed 8m 56s (remain 448m 28s) Loss: 0.0499(0.0730) Grad: 0.5311  
Epoch: [1][1020/51233] Data 0.310 (0.311) Elapsed 9m 6s (remain 448m 15s) Loss: 0.0377(0.0729) Grad: 0.6407  
Epoch: [1][1040/51233] Data 0.318 (0.311) Elapsed 9m 17s (remain 448m 1s) Loss: 0.0173(0.0728) Grad: 0.2590  
Epoch: [1][1060/51233] Data 0.307 (0.311) Elapsed 9m 28s (remain 447m 48s) Loss: 0.0064(0.0724) Grad: 0.0776  
Epoch: [1][1080/51233] Data 0.307 (0.311) Elapsed 9m 38s (remain 447m 35s) Loss: 0.0148(0.0728) Grad: 0.2195  
Epoch: [1][1100/51233] Data 0.307 (0.311) Elapsed 9m 49s (remain 447m 22s) Loss: 0.0092(0.0725) Grad: 0.0974  
Epoch: [1][1120/51233] Data 0.318 (0.311) Elapsed 10m 0s (remain 447m 9s) Loss: 0.0694(0.0724) Grad: 0.8905  
Epoch: [1][1140/51233] Data 0.310 (0.311) Elapsed 10m 10s (remain 446m 56s) Loss: 0.0322(0.0725) Grad: 0.4937  
Epoch: [1][1160/51233] Data 0.317 (0.311) Elapsed 10m 21s (remain 446m 50s) Loss: 0.0776(0.0727) Grad: 0.7981  
Epoch: [1][1180/51233] Data 0.318 (0.311) Elapsed 10m 32s (remain 446m 37s) Loss: 0.1799(0.0723) Grad: 2.1466  
Epoch: [1][1200/51233] Data 0.313 (0.311) Elapsed 10m 42s (remain 446m 24s) Loss: 0.0305(0.0723) Grad: 0.6932  
Epoch: [1][1220/51233] Data 0.310 (0.311) Elapsed 10m 53s (remain 446m 11s) Loss: 0.0453(0.0725) Grad: 0.5013  
Epoch: [1][1240/51233] Data 0.307 (0.311) Elapsed 11m 4s (remain 445m 59s) Loss: 0.0126(0.0725) Grad: 0.1880  
Epoch: [1][1260/51233] Data 0.308 (0.311) Elapsed 11m 14s (remain 445m 46s) Loss: 0.0305(0.0723) Grad: 0.5323  
Epoch: [1][1280/51233] Data 0.310 (0.311) Elapsed 11m 25s (remain 445m 33s) Loss: 0.0012(0.0726) Grad: 0.0207  
Epoch: [1][1300/51233] Data 0.318 (0.311) Elapsed 11m 36s (remain 445m 21s) Loss: 0.0871(0.0728) Grad: 0.8117  
Epoch: [1][1320/51233] Data 0.309 (0.311) Elapsed 11m 46s (remain 445m 9s) Loss: 0.4625(0.0732) Grad: 3.0907  
Epoch: [1][1340/51233] Data 0.306 (0.311) Elapsed 11m 57s (remain 444m 56s) Loss: 0.0457(0.0729) Grad: 0.4612  
Epoch: [1][1360/51233] Data 0.317 (0.311) Elapsed 12m 8s (remain 444m 45s) Loss: 0.0422(0.0729) Grad: 0.8795  
Epoch: [1][1380/51233] Data 0.305 (0.311) Elapsed 12m 18s (remain 444m 32s) Loss: 0.2487(0.0727) Grad: 1.8679  
Epoch: [1][1400/51233] Data 0.310 (0.311) Elapsed 12m 29s (remain 444m 19s) Loss: 0.0714(0.0730) Grad: 1.1284  
Epoch: [1][1420/51233] Data 0.317 (0.311) Elapsed 12m 40s (remain 444m 7s) Loss: 0.3021(0.0733) Grad: 2.0149  
Epoch: [1][1440/51233] Data 0.309 (0.311) Elapsed 12m 50s (remain 443m 56s) Loss: 0.0550(0.0735) Grad: 0.5636  
Epoch: [1][1460/51233] Data 0.309 (0.311) Elapsed 13m 1s (remain 443m 43s) Loss: 0.2252(0.0734) Grad: 1.2033  
Epoch: [1][1480/51233] Data 0.309 (0.311) Elapsed 13m 12s (remain 443m 31s) Loss: 0.0098(0.0729) Grad: 0.1506  
Epoch: [1][1500/51233] Data 0.266 (0.311) Elapsed 13m 22s (remain 443m 19s) Loss: 0.0084(0.0724) Grad: 0.1091  
Epoch: [1][1520/51233] Data 0.309 (0.311) Elapsed 13m 33s (remain 443m 7s) Loss: 0.1998(0.0726) Grad: 2.4084  
Epoch: [1][1540/51233] Data 0.309 (0.311) Elapsed 13m 44s (remain 442m 55s) Loss: 0.0055(0.0722) Grad: 0.0583  
Epoch: [1][1560/51233] Data 0.317 (0.311) Elapsed 13m 54s (remain 442m 43s) Loss: 0.0740(0.0721) Grad: 0.8136  
Epoch: [1][1580/51233] Data 0.311 (0.311) Elapsed 14m 5s (remain 442m 31s) Loss: 0.0124(0.0715) Grad: 0.3947  
Epoch: [1][1600/51233] Data 0.309 (0.311) Elapsed 14m 16s (remain 442m 19s) Loss: 0.0088(0.0716) Grad: 0.1800  
Epoch: [1][1620/51233] Data 0.311 (0.311) Elapsed 14m 26s (remain 442m 8s) Loss: 0.1845(0.0717) Grad: 2.8496  
Epoch: [1][1640/51233] Data 0.317 (0.311) Elapsed 14m 37s (remain 441m 56s) Loss: 0.0192(0.0716) Grad: 0.2670  
Epoch: [1][1660/51233] Data 0.312 (0.311) Elapsed 14m 48s (remain 441m 44s) Loss: 0.0157(0.0715) Grad: 0.1674  
Epoch: [1][1680/51233] Data 0.319 (0.311) Elapsed 14m 58s (remain 441m 32s) Loss: 0.0994(0.0713) Grad: 1.5535  
Epoch: [1][1700/51233] Data 0.309 (0.311) Elapsed 15m 9s (remain 441m 20s) Loss: 0.0847(0.0712) Grad: 1.3566  
Epoch: [1][1720/51233] Data 0.310 (0.311) Elapsed 15m 20s (remain 441m 9s) Loss: 0.0089(0.0714) Grad: 0.2159  
Epoch: [1][1740/51233] Data 0.318 (0.311) Elapsed 15m 30s (remain 440m 57s) Loss: 0.2927(0.0714) Grad: 2.0369  
Epoch: [1][1760/51233] Data 0.310 (0.311) Elapsed 15m 41s (remain 440m 45s) Loss: 0.0041(0.0712) Grad: 0.0797  
Epoch: [1][1780/51233] Data 0.308 (0.311) Elapsed 15m 52s (remain 440m 34s) Loss: 0.0595(0.0708) Grad: 1.1097  
Epoch: [1][1800/51233] Data 0.317 (0.311) Elapsed 16m 2s (remain 440m 22s) Loss: 0.0705(0.0708) Grad: 1.1656  
Epoch: [1][1820/51233] Data 0.309 (0.311) Elapsed 16m 13s (remain 440m 11s) Loss: 0.0529(0.0707) Grad: 0.9678  
Epoch: [1][1840/51233] Data 0.318 (0.311) Elapsed 16m 23s (remain 439m 59s) Loss: 0.1559(0.0712) Grad: 1.6128  
Epoch: [1][1860/51233] Data 0.308 (0.311) Elapsed 16m 34s (remain 439m 47s) Loss: 0.0068(0.0709) Grad: 0.0990  
Epoch: [1][1880/51233] Data 0.300 (0.311) Elapsed 16m 45s (remain 439m 36s) Loss: 0.1103(0.0713) Grad: 1.5675  
Epoch: [1][1900/51233] Data 0.318 (0.311) Elapsed 16m 55s (remain 439m 25s) Loss: 0.0088(0.0713) Grad: 0.1380  
Epoch: [1][1920/51233] Data 0.316 (0.311) Elapsed 17m 6s (remain 439m 13s) Loss: 0.0767(0.0712) Grad: 0.8389  
Epoch: [1][1940/51233] Data 0.308 (0.311) Elapsed 17m 17s (remain 439m 2s) Loss: 0.0154(0.0715) Grad: 0.2113  
Epoch: [1][1960/51233] Data 0.318 (0.311) Elapsed 17m 27s (remain 438m 50s) Loss: 0.2108(0.0713) Grad: 2.6581  
Epoch: [1][1980/51233] Data 0.307 (0.311) Elapsed 17m 38s (remain 438m 39s) Loss: 0.0042(0.0710) Grad: 0.0676  
Epoch: [1][2000/51233] Data 0.307 (0.311) Elapsed 17m 49s (remain 438m 27s) Loss: 0.0208(0.0710) Grad: 0.3202  
Epoch: [1][2020/51233] Data 0.317 (0.311) Elapsed 17m 59s (remain 438m 16s) Loss: 0.0072(0.0710) Grad: 0.1261  
Epoch: [1][2040/51233] Data 0.303 (0.311) Elapsed 18m 10s (remain 438m 4s) Loss: 0.0242(0.0716) Grad: 0.4031  
Epoch: [1][2060/51233] Data 0.316 (0.311) Elapsed 18m 21s (remain 437m 53s) Loss: 0.0807(0.0715) Grad: 0.9381  
Epoch: [1][2080/51233] Data 0.312 (0.311) Elapsed 18m 31s (remain 437m 42s) Loss: 0.3448(0.0717) Grad: 2.6722  
Epoch: [1][2100/51233] Data 0.317 (0.311) Elapsed 18m 42s (remain 437m 30s) Loss: 0.0342(0.0718) Grad: 0.4223  
Epoch: [1][2120/51233] Data 0.299 (0.311) Elapsed 18m 53s (remain 437m 19s) Loss: 0.0525(0.0717) Grad: 1.3755  
Epoch: [1][2140/51233] Data 0.309 (0.311) Elapsed 19m 3s (remain 437m 8s) Loss: 0.0152(0.0715) Grad: 0.2075  
Epoch: [1][2160/51233] Data 0.316 (0.311) Elapsed 19m 14s (remain 436m 57s) Loss: 0.0115(0.0713) Grad: 0.1692  
Epoch: [1][2180/51233] Data 0.307 (0.311) Elapsed 19m 25s (remain 436m 46s) Loss: 0.1847(0.0714) Grad: 1.6257  
Epoch: [1][2200/51233] Data 0.297 (0.311) Elapsed 19m 35s (remain 436m 34s) Loss: 0.0121(0.0714) Grad: 0.3053  
Epoch: [1][2220/51233] Data 0.308 (0.311) Elapsed 19m 46s (remain 436m 23s) Loss: 0.0199(0.0717) Grad: 0.2247  
Epoch: [1][2240/51233] Data 0.317 (0.311) Elapsed 19m 57s (remain 436m 12s) Loss: 0.1186(0.0716) Grad: 1.1624  
Epoch: [1][2260/51233] Data 0.306 (0.311) Elapsed 20m 7s (remain 436m 0s) Loss: 0.0197(0.0715) Grad: 0.3129  
Epoch: [1][2280/51233] Data 0.314 (0.311) Elapsed 20m 18s (remain 435m 49s) Loss: 0.1874(0.0714) Grad: 1.1681  
Epoch: [1][2300/51233] Data 0.318 (0.311) Elapsed 20m 29s (remain 435m 38s) Loss: 0.0160(0.0712) Grad: 0.2008  
Epoch: [1][2320/51233] Data 0.309 (0.311) Elapsed 20m 39s (remain 435m 26s) Loss: 0.0092(0.0712) Grad: 0.2429  
Epoch: [1][2340/51233] Data 0.317 (0.311) Elapsed 20m 50s (remain 435m 15s) Loss: 0.0100(0.0711) Grad: 0.2066  
Epoch: [1][2360/51233] Data 0.300 (0.311) Elapsed 21m 1s (remain 435m 4s) Loss: 0.1685(0.0712) Grad: 1.3652  
Epoch: [1][2380/51233] Data 0.314 (0.311) Elapsed 21m 11s (remain 434m 53s) Loss: 0.0219(0.0711) Grad: 0.2776  
Epoch: [1][2400/51233] Data 0.308 (0.311) Elapsed 21m 22s (remain 434m 42s) Loss: 0.0919(0.0711) Grad: 1.1912  
Epoch: [1][2420/51233] Data 0.318 (0.311) Elapsed 21m 33s (remain 434m 31s) Loss: 0.0787(0.0709) Grad: 0.6027  
Epoch: [1][2440/51233] Data 0.293 (0.311) Elapsed 21m 43s (remain 434m 19s) Loss: 0.0952(0.0710) Grad: 1.8801  
Epoch: [1][2460/51233] Data 0.307 (0.311) Elapsed 21m 54s (remain 434m 8s) Loss: 0.0118(0.0710) Grad: 0.1501  
Epoch: [1][2480/51233] Data 0.317 (0.311) Elapsed 22m 5s (remain 433m 57s) Loss: 0.2051(0.0710) Grad: 2.3615  
Epoch: [1][2500/51233] Data 0.317 (0.311) Elapsed 22m 15s (remain 433m 46s) Loss: 0.2099(0.0710) Grad: 2.0290  
Epoch: [1][2520/51233] Data 0.296 (0.311) Elapsed 22m 26s (remain 433m 35s) Loss: 0.0233(0.0708) Grad: 0.3560  
Epoch: [1][2540/51233] Data 0.305 (0.311) Elapsed 22m 37s (remain 433m 24s) Loss: 0.0226(0.0711) Grad: 0.3494  
Epoch: [1][2560/51233] Data 0.296 (0.311) Elapsed 22m 47s (remain 433m 12s) Loss: 0.0033(0.0709) Grad: 0.0385  
Epoch: [1][2580/51233] Data 0.317 (0.311) Elapsed 22m 58s (remain 433m 1s) Loss: 0.1530(0.0709) Grad: 1.1309  
Epoch: [1][2600/51233] Data 0.301 (0.311) Elapsed 23m 8s (remain 432m 50s) Loss: 0.0176(0.0706) Grad: 0.3370  
Epoch: [1][2620/51233] Data 0.312 (0.311) Elapsed 23m 19s (remain 432m 39s) Loss: 0.0086(0.0702) Grad: 0.1359  
Epoch: [1][2640/51233] Data 0.308 (0.311) Elapsed 23m 30s (remain 432m 28s) Loss: 0.1521(0.0704) Grad: 2.1917  
Epoch: [1][2660/51233] Data 0.316 (0.311) Elapsed 23m 40s (remain 432m 17s) Loss: 0.1388(0.0704) Grad: 1.1600  
Epoch: [1][2680/51233] Data 0.316 (0.311) Elapsed 23m 51s (remain 432m 6s) Loss: 0.0665(0.0704) Grad: 1.0232  
Epoch: [1][2700/51233] Data 0.313 (0.311) Elapsed 24m 2s (remain 431m 55s) Loss: 0.0392(0.0704) Grad: 0.5056  
Epoch: [1][2720/51233] Data 0.309 (0.311) Elapsed 24m 12s (remain 431m 43s) Loss: 0.0571(0.0702) Grad: 1.1692  
Epoch: [1][2740/51233] Data 0.309 (0.311) Elapsed 24m 23s (remain 431m 33s) Loss: 0.0131(0.0701) Grad: 0.1374  
Epoch: [1][2760/51233] Data 0.309 (0.311) Elapsed 24m 34s (remain 431m 21s) Loss: 0.0074(0.0700) Grad: 0.2109  
Epoch: [1][2780/51233] Data 0.309 (0.311) Elapsed 24m 44s (remain 431m 10s) Loss: 0.3379(0.0701) Grad: 1.0805  
Epoch: [1][2800/51233] Data 0.315 (0.311) Elapsed 24m 55s (remain 430m 59s) Loss: 0.0765(0.0703) Grad: 0.9419  
Epoch: [1][2820/51233] Data 0.317 (0.311) Elapsed 25m 6s (remain 430m 48s) Loss: 0.0595(0.0702) Grad: 0.7976  
Epoch: [1][2840/51233] Data 0.318 (0.311) Elapsed 25m 16s (remain 430m 37s) Loss: 0.1039(0.0700) Grad: 1.5492  
Epoch: [1][2860/51233] Data 0.309 (0.311) Elapsed 25m 27s (remain 430m 26s) Loss: 0.1316(0.0700) Grad: 1.4824  
Epoch: [1][2880/51233] Data 0.318 (0.311) Elapsed 25m 38s (remain 430m 15s) Loss: 0.1066(0.0700) Grad: 1.9514  
Epoch: [1][2900/51233] Data 0.318 (0.311) Elapsed 25m 48s (remain 430m 4s) Loss: 0.0192(0.0700) Grad: 0.2731  
Epoch: [1][2920/51233] Data 0.315 (0.311) Elapsed 25m 59s (remain 429m 53s) Loss: 0.0354(0.0700) Grad: 0.5521  
Epoch: [1][2940/51233] Data 0.305 (0.311) Elapsed 26m 10s (remain 429m 42s) Loss: 0.0246(0.0700) Grad: 0.5636  
Epoch: [1][2960/51233] Data 0.309 (0.311) Elapsed 26m 20s (remain 429m 31s) Loss: 0.0100(0.0698) Grad: 0.1106  
Epoch: [1][2980/51233] Data 0.318 (0.311) Elapsed 26m 31s (remain 429m 21s) Loss: 0.0791(0.0700) Grad: 0.8499  
Epoch: [1][3000/51233] Data 0.310 (0.311) Elapsed 26m 42s (remain 429m 9s) Loss: 0.0038(0.0699) Grad: 0.0483  
Epoch: [1][3020/51233] Data 0.310 (0.311) Elapsed 26m 52s (remain 428m 58s) Loss: 0.0139(0.0698) Grad: 0.1759  
Epoch: [1][3040/51233] Data 0.308 (0.311) Elapsed 27m 3s (remain 428m 47s) Loss: 0.0247(0.0695) Grad: 0.6742  
Epoch: [1][3060/51233] Data 0.309 (0.311) Elapsed 27m 14s (remain 428m 37s) Loss: 0.0870(0.0697) Grad: 2.0281  
Epoch: [1][3080/51233] Data 0.308 (0.311) Elapsed 27m 24s (remain 428m 26s) Loss: 0.1526(0.0696) Grad: 1.9662  
Epoch: [1][3100/51233] Data 0.311 (0.311) Elapsed 27m 35s (remain 428m 15s) Loss: 0.0050(0.0694) Grad: 0.0731  
Epoch: [1][3120/51233] Data 0.317 (0.311) Elapsed 27m 46s (remain 428m 4s) Loss: 0.0109(0.0693) Grad: 0.1922  
Epoch: [1][3140/51233] Data 0.312 (0.311) Elapsed 27m 56s (remain 427m 53s) Loss: 0.0013(0.0693) Grad: 0.0348  
Epoch: [1][3160/51233] Data 0.313 (0.311) Elapsed 28m 7s (remain 427m 42s) Loss: 0.0129(0.0692) Grad: 0.2403  
Epoch: [1][3180/51233] Data 0.312 (0.311) Elapsed 28m 18s (remain 427m 31s) Loss: 0.0904(0.0693) Grad: 2.2479  
Epoch: [1][3200/51233] Data 0.312 (0.311) Elapsed 28m 28s (remain 427m 20s) Loss: 0.0736(0.0692) Grad: 1.2996  
Epoch: [1][3220/51233] Data 0.312 (0.311) Elapsed 28m 39s (remain 427m 9s) Loss: 0.0049(0.0690) Grad: 0.0762  
Epoch: [1][3240/51233] Data 0.309 (0.311) Elapsed 28m 50s (remain 426m 58s) Loss: 0.0130(0.0689) Grad: 0.2768  
Epoch: [1][3260/51233] Data 0.308 (0.311) Elapsed 29m 0s (remain 426m 47s) Loss: 0.0059(0.0688) Grad: 0.1290  
Epoch: [1][3280/51233] Data 0.317 (0.311) Elapsed 29m 11s (remain 426m 36s) Loss: 0.3123(0.0689) Grad: 1.9170  
Epoch: [1][3300/51233] Data 0.317 (0.311) Elapsed 29m 22s (remain 426m 25s) Loss: 0.1028(0.0690) Grad: 0.5958  
Epoch: [1][3320/51233] Data 0.317 (0.311) Elapsed 29m 32s (remain 426m 14s) Loss: 0.0658(0.0691) Grad: 0.7371  
Epoch: [1][3340/51233] Data 0.312 (0.311) Elapsed 29m 43s (remain 426m 3s) Loss: 0.0117(0.0690) Grad: 0.1906  
Epoch: [1][3360/51233] Data 0.302 (0.311) Elapsed 29m 54s (remain 425m 52s) Loss: 0.0068(0.0689) Grad: 0.0951  
Epoch: [1][3380/51233] Data 0.312 (0.311) Elapsed 30m 4s (remain 425m 41s) Loss: 0.0039(0.0688) Grad: 0.0715  
Epoch: [1][3400/51233] Data 0.309 (0.311) Elapsed 30m 15s (remain 425m 30s) Loss: 0.0563(0.0687) Grad: 0.8894  
Epoch: [1][3420/51233] Data 0.308 (0.311) Elapsed 30m 26s (remain 425m 20s) Loss: 0.1286(0.0688) Grad: 1.5595  
Epoch: [1][3440/51233] Data 0.300 (0.311) Elapsed 30m 36s (remain 425m 9s) Loss: 0.0063(0.0688) Grad: 0.0887  
Epoch: [1][3460/51233] Data 0.309 (0.311) Elapsed 30m 47s (remain 424m 58s) Loss: 0.3089(0.0689) Grad: 2.1793  
Epoch: [1][3480/51233] Data 0.318 (0.311) Elapsed 30m 57s (remain 424m 47s) Loss: 0.0486(0.0689) Grad: 0.8810  
Epoch: [1][3500/51233] Data 0.313 (0.311) Elapsed 31m 8s (remain 424m 36s) Loss: 0.0248(0.0690) Grad: 0.3900  
Epoch: [1][3520/51233] Data 0.308 (0.311) Elapsed 31m 19s (remain 424m 25s) Loss: 0.1162(0.0692) Grad: 1.5744  
Epoch: [1][3540/51233] Data 0.310 (0.311) Elapsed 31m 29s (remain 424m 14s) Loss: 0.0227(0.0692) Grad: 0.2103  
Epoch: [1][3560/51233] Data 0.318 (0.311) Elapsed 31m 40s (remain 424m 3s) Loss: 0.0814(0.0691) Grad: 1.6094  
Epoch: [1][3580/51233] Data 0.308 (0.311) Elapsed 31m 51s (remain 423m 53s) Loss: 0.0270(0.0691) Grad: 0.3231  
Epoch: [1][3600/51233] Data 0.307 (0.311) Elapsed 32m 1s (remain 423m 42s) Loss: 0.0108(0.0691) Grad: 0.1301  
Epoch: [1][3620/51233] Data 0.309 (0.311) Elapsed 32m 12s (remain 423m 31s) Loss: 0.0181(0.0689) Grad: 0.3569  
Epoch: [1][3640/51233] Data 0.318 (0.311) Elapsed 32m 23s (remain 423m 20s) Loss: 0.0036(0.0687) Grad: 0.0498  
Epoch: [1][3660/51233] Data 0.311 (0.311) Elapsed 32m 33s (remain 423m 9s) Loss: 0.0563(0.0688) Grad: 0.9376  
Epoch: [1][3680/51233] Data 0.301 (0.311) Elapsed 32m 44s (remain 422m 58s) Loss: 0.2840(0.0688) Grad: 2.1132  
Epoch: [1][3700/51233] Data 0.296 (0.311) Elapsed 32m 55s (remain 422m 47s) Loss: 0.0801(0.0687) Grad: 1.2856  
Epoch: [1][3720/51233] Data 0.295 (0.311) Elapsed 33m 5s (remain 422m 36s) Loss: 0.0406(0.0686) Grad: 0.5563  
Epoch: [1][3740/51233] Data 0.309 (0.311) Elapsed 33m 16s (remain 422m 26s) Loss: 0.1422(0.0687) Grad: 1.3242  
Epoch: [1][3760/51233] Data 0.313 (0.311) Elapsed 33m 27s (remain 422m 15s) Loss: 0.0219(0.0686) Grad: 0.3241  
Epoch: [1][3780/51233] Data 0.312 (0.311) Elapsed 33m 37s (remain 422m 4s) Loss: 0.0044(0.0685) Grad: 0.0717  
Epoch: [1][3800/51233] Data 0.310 (0.311) Elapsed 33m 48s (remain 421m 53s) Loss: 0.0018(0.0684) Grad: 0.0210  
Epoch: [1][3820/51233] Data 0.306 (0.311) Elapsed 33m 59s (remain 421m 42s) Loss: 0.1055(0.0683) Grad: 1.2664  
Epoch: [1][3840/51233] Data 0.309 (0.311) Elapsed 34m 9s (remain 421m 31s) Loss: 0.0261(0.0683) Grad: 0.4103  
Epoch: [1][3860/51233] Data 0.314 (0.311) Elapsed 34m 20s (remain 421m 21s) Loss: 0.0062(0.0682) Grad: 0.1144  
Epoch: [1][3880/51233] Data 0.308 (0.311) Elapsed 34m 31s (remain 421m 10s) Loss: 0.0037(0.0681) Grad: 0.0555  
Epoch: [1][3900/51233] Data 0.299 (0.311) Elapsed 34m 41s (remain 420m 59s) Loss: 0.0054(0.0682) Grad: 0.0864  
Epoch: [1][3920/51233] Data 0.318 (0.311) Elapsed 34m 52s (remain 420m 48s) Loss: 0.0015(0.0680) Grad: 0.0281  
Epoch: [1][3940/51233] Data 0.307 (0.311) Elapsed 35m 3s (remain 420m 37s) Loss: 0.1245(0.0680) Grad: 1.8972  
Epoch: [1][3960/51233] Data 0.316 (0.311) Elapsed 35m 13s (remain 420m 26s) Loss: 0.0067(0.0679) Grad: 0.0968  
Epoch: [1][3980/51233] Data 0.317 (0.311) Elapsed 35m 24s (remain 420m 15s) Loss: 0.0138(0.0680) Grad: 0.2539  
Epoch: [1][4000/51233] Data 0.307 (0.311) Elapsed 35m 35s (remain 420m 5s) Loss: 0.0005(0.0679) Grad: 0.0074  
Epoch: [1][4020/51233] Data 0.295 (0.311) Elapsed 35m 45s (remain 419m 54s) Loss: 0.1084(0.0679) Grad: 0.8729  
Epoch: [1][4040/51233] Data 0.313 (0.311) Elapsed 35m 56s (remain 419m 43s) Loss: 0.0364(0.0680) Grad: 0.8578  
Epoch: [1][4060/51233] Data 0.308 (0.311) Elapsed 36m 7s (remain 419m 32s) Loss: 0.0288(0.0680) Grad: 0.5536  
Epoch: [1][4080/51233] Data 0.308 (0.311) Elapsed 36m 17s (remain 419m 21s) Loss: 0.0366(0.0680) Grad: 0.5273  
Epoch: [1][4100/51233] Data 0.310 (0.311) Elapsed 36m 28s (remain 419m 11s) Loss: 0.5868(0.0680) Grad: 4.4933  
Epoch: [1][4120/51233] Data 0.317 (0.311) Elapsed 36m 39s (remain 419m 0s) Loss: 0.2902(0.0679) Grad: 3.0714  
Epoch: [1][4140/51233] Data 0.308 (0.311) Elapsed 36m 49s (remain 418m 49s) Loss: 0.0674(0.0681) Grad: 1.3826  
Epoch: [1][4160/51233] Data 0.311 (0.311) Elapsed 37m 0s (remain 418m 38s) Loss: 0.0375(0.0682) Grad: 0.7225  
Epoch: [1][4180/51233] Data 0.312 (0.311) Elapsed 37m 11s (remain 418m 27s) Loss: 0.1604(0.0683) Grad: 2.3890  
Epoch: [1][4200/51233] Data 0.311 (0.311) Elapsed 37m 21s (remain 418m 16s) Loss: 0.0093(0.0682) Grad: 0.0974  
Epoch: [1][4220/51233] Data 0.317 (0.311) Elapsed 37m 32s (remain 418m 6s) Loss: 0.0087(0.0681) Grad: 0.1730  
Epoch: [1][4240/51233] Data 0.317 (0.311) Elapsed 37m 43s (remain 417m 55s) Loss: 0.0340(0.0681) Grad: 0.3409  
Epoch: [1][4260/51233] Data 0.318 (0.311) Elapsed 37m 53s (remain 417m 44s) Loss: 0.0041(0.0680) Grad: 0.0548  
Epoch: [1][4280/51233] Data 0.310 (0.311) Elapsed 38m 4s (remain 417m 33s) Loss: 0.0066(0.0680) Grad: 0.0894  
Epoch: [1][4300/51233] Data 0.298 (0.311) Elapsed 38m 14s (remain 417m 22s) Loss: 0.0568(0.0681) Grad: 0.5990  
Epoch: [1][4320/51233] Data 0.304 (0.311) Elapsed 38m 25s (remain 417m 11s) Loss: 0.0050(0.0680) Grad: 0.0772  
Epoch: [1][4340/51233] Data 0.308 (0.311) Elapsed 38m 36s (remain 417m 1s) Loss: 0.0149(0.0680) Grad: 0.1644  
Epoch: [1][4360/51233] Data 0.318 (0.311) Elapsed 38m 46s (remain 416m 50s) Loss: 0.0698(0.0680) Grad: 1.9853  
Epoch: [1][4380/51233] Data 0.317 (0.311) Elapsed 38m 57s (remain 416m 39s) Loss: 0.1943(0.0681) Grad: 1.8015  
Epoch: [1][4400/51233] Data 0.304 (0.311) Elapsed 39m 8s (remain 416m 28s) Loss: 0.0097(0.0681) Grad: 0.1177  
Epoch: [1][4420/51233] Data 0.309 (0.311) Elapsed 39m 18s (remain 416m 17s) Loss: 0.0120(0.0683) Grad: 0.1214  
Epoch: [1][4440/51233] Data 0.318 (0.311) Elapsed 39m 29s (remain 416m 6s) Loss: 0.1423(0.0683) Grad: 1.4117  
Epoch: [1][4460/51233] Data 0.297 (0.311) Elapsed 39m 40s (remain 415m 56s) Loss: 0.0189(0.0682) Grad: 0.4412  
Epoch: [1][4480/51233] Data 0.316 (0.311) Elapsed 39m 50s (remain 415m 45s) Loss: 0.0741(0.0681) Grad: 1.2000  
Epoch: [1][4500/51233] Data 0.308 (0.311) Elapsed 40m 1s (remain 415m 34s) Loss: 0.0066(0.0681) Grad: 0.0738  
Epoch: [1][4520/51233] Data 0.314 (0.311) Elapsed 40m 12s (remain 415m 23s) Loss: 0.0023(0.0680) Grad: 0.0735  
Epoch: [1][4540/51233] Data 0.317 (0.311) Elapsed 40m 22s (remain 415m 12s) Loss: 0.0154(0.0681) Grad: 0.4120  
Epoch: [1][4560/51233] Data 0.297 (0.311) Elapsed 40m 33s (remain 415m 2s) Loss: 0.2144(0.0683) Grad: 1.3681  
Epoch: [1][4580/51233] Data 0.312 (0.311) Elapsed 40m 44s (remain 414m 51s) Loss: 0.0555(0.0682) Grad: 0.5855  
Epoch: [1][4600/51233] Data 0.315 (0.311) Elapsed 40m 54s (remain 414m 40s) Loss: 0.1213(0.0683) Grad: 2.8220  
Epoch: [1][4620/51233] Data 0.309 (0.311) Elapsed 41m 5s (remain 414m 29s) Loss: 0.1168(0.0682) Grad: 1.8780  
Epoch: [1][4640/51233] Data 0.305 (0.311) Elapsed 41m 16s (remain 414m 19s) Loss: 0.0684(0.0681) Grad: 1.2661  
Epoch: [1][4660/51233] Data 0.312 (0.311) Elapsed 41m 26s (remain 414m 8s) Loss: 0.0727(0.0682) Grad: 0.7885  
Epoch: [1][4680/51233] Data 0.307 (0.311) Elapsed 41m 37s (remain 413m 57s) Loss: 0.0289(0.0682) Grad: 0.4160  
Epoch: [1][4700/51233] Data 0.297 (0.311) Elapsed 41m 48s (remain 413m 46s) Loss: 0.0099(0.0683) Grad: 0.2047  
Epoch: [1][4720/51233] Data 0.318 (0.311) Elapsed 41m 58s (remain 413m 35s) Loss: 0.0080(0.0682) Grad: 0.1395  
Epoch: [1][4740/51233] Data 0.304 (0.311) Elapsed 42m 9s (remain 413m 25s) Loss: 0.0016(0.0680) Grad: 0.0187  
Epoch: [1][4760/51233] Data 0.305 (0.311) Elapsed 42m 20s (remain 413m 14s) Loss: 0.0130(0.0679) Grad: 0.2331  
Epoch: [1][4780/51233] Data 0.318 (0.311) Elapsed 42m 30s (remain 413m 3s) Loss: 0.0179(0.0677) Grad: 0.6396  
Epoch: [1][4800/51233] Data 0.309 (0.311) Elapsed 42m 41s (remain 412m 52s) Loss: 0.0217(0.0677) Grad: 0.4579  
Epoch: [1][4820/51233] Data 0.310 (0.311) Elapsed 42m 52s (remain 412m 41s) Loss: 0.0095(0.0677) Grad: 0.2299  
Epoch: [1][4840/51233] Data 0.308 (0.311) Elapsed 43m 2s (remain 412m 31s) Loss: 0.4197(0.0677) Grad: 2.3635  
Epoch: [1][4860/51233] Data 0.306 (0.311) Elapsed 43m 13s (remain 412m 20s) Loss: 0.0048(0.0679) Grad: 0.0522  
Epoch: [1][4880/51233] Data 0.306 (0.311) Elapsed 43m 24s (remain 412m 9s) Loss: 0.0547(0.0679) Grad: 0.7952  
Epoch: [1][4900/51233] Data 0.317 (0.311) Elapsed 43m 34s (remain 411m 58s) Loss: 0.0249(0.0679) Grad: 0.3528  
Epoch: [1][4920/51233] Data 0.317 (0.311) Elapsed 43m 45s (remain 411m 48s) Loss: 0.0056(0.0679) Grad: 0.0755  
Epoch: [1][4940/51233] Data 0.306 (0.311) Elapsed 43m 56s (remain 411m 37s) Loss: 0.2843(0.0679) Grad: 1.4976  
Epoch: [1][4960/51233] Data 0.310 (0.311) Elapsed 44m 6s (remain 411m 26s) Loss: 0.1408(0.0679) Grad: 1.7796  
Epoch: [1][4980/51233] Data 0.310 (0.311) Elapsed 44m 17s (remain 411m 15s) Loss: 0.0045(0.0679) Grad: 0.0574  
Epoch: [1][5000/51233] Data 0.309 (0.311) Elapsed 44m 28s (remain 411m 4s) Loss: 0.0547(0.0680) Grad: 0.6459  
Epoch: [1][5020/51233] Data 0.318 (0.311) Elapsed 44m 38s (remain 410m 54s) Loss: 0.0028(0.0681) Grad: 0.0328  
Epoch: [1][5040/51233] Data 0.309 (0.311) Elapsed 44m 49s (remain 410m 43s) Loss: 0.0978(0.0682) Grad: 1.5925  
Epoch: [1][5060/51233] Data 0.300 (0.311) Elapsed 45m 0s (remain 410m 32s) Loss: 0.0376(0.0683) Grad: 0.6595  
Epoch: [1][5080/51233] Data 0.303 (0.311) Elapsed 45m 10s (remain 410m 21s) Loss: 0.0038(0.0682) Grad: 0.0786  
Epoch: [1][5100/51233] Data 0.310 (0.311) Elapsed 45m 21s (remain 410m 11s) Loss: 0.0195(0.0682) Grad: 0.2466  
Epoch: [1][5120/51233] Data 0.313 (0.311) Elapsed 45m 32s (remain 410m 0s) Loss: 0.0332(0.0682) Grad: 0.4123  
Epoch: [1][5140/51233] Data 0.308 (0.311) Elapsed 45m 42s (remain 409m 49s) Loss: 0.0122(0.0681) Grad: 0.2099  
Epoch: [1][5160/51233] Data 0.300 (0.311) Elapsed 45m 53s (remain 409m 39s) Loss: 0.2381(0.0680) Grad: 1.7605  
Epoch: [1][5180/51233] Data 0.317 (0.311) Elapsed 46m 4s (remain 409m 28s) Loss: 0.2036(0.0681) Grad: 1.4274  
Epoch: [1][5200/51233] Data 0.303 (0.311) Elapsed 46m 14s (remain 409m 17s) Loss: 0.0224(0.0680) Grad: 0.5013  
Epoch: [1][5220/51233] Data 0.308 (0.311) Elapsed 46m 25s (remain 409m 6s) Loss: 0.0017(0.0680) Grad: 0.0172  
Epoch: [1][5240/51233] Data 0.315 (0.311) Elapsed 46m 35s (remain 408m 55s) Loss: 0.0114(0.0681) Grad: 0.1623  
Epoch: [1][5260/51233] Data 0.304 (0.311) Elapsed 46m 46s (remain 408m 45s) Loss: 0.0670(0.0680) Grad: 0.6285  
Epoch: [1][5280/51233] Data 0.317 (0.311) Elapsed 46m 57s (remain 408m 34s) Loss: 0.0417(0.0681) Grad: 0.6438  
Epoch: [1][5300/51233] Data 0.314 (0.311) Elapsed 47m 7s (remain 408m 23s) Loss: 0.1008(0.0681) Grad: 1.4676  
Epoch: [1][5320/51233] Data 0.305 (0.311) Elapsed 47m 18s (remain 408m 12s) Loss: 0.2661(0.0681) Grad: 2.7992  
Epoch: [1][5340/51233] Data 0.294 (0.311) Elapsed 47m 29s (remain 408m 2s) Loss: 0.0338(0.0681) Grad: 0.7576  
Epoch: [1][5360/51233] Data 0.318 (0.311) Elapsed 47m 39s (remain 407m 51s) Loss: 0.0165(0.0681) Grad: 0.4204  
Epoch: [1][5380/51233] Data 0.311 (0.311) Elapsed 47m 50s (remain 407m 40s) Loss: 0.0237(0.0682) Grad: 0.2831  
Epoch: [1][5400/51233] Data 0.309 (0.311) Elapsed 48m 1s (remain 407m 29s) Loss: 0.0235(0.0681) Grad: 0.3242  
Epoch: [1][5420/51233] Data 0.302 (0.311) Elapsed 48m 11s (remain 407m 18s) Loss: 0.0015(0.0681) Grad: 0.0132  
Epoch: [1][5440/51233] Data 0.292 (0.311) Elapsed 48m 22s (remain 407m 8s) Loss: 0.0183(0.0680) Grad: 0.2685  
Epoch: [1][5460/51233] Data 0.309 (0.311) Elapsed 48m 33s (remain 406m 57s) Loss: 0.1704(0.0680) Grad: 2.1164  
Epoch: [1][5480/51233] Data 0.303 (0.311) Elapsed 48m 43s (remain 406m 46s) Loss: 0.0934(0.0680) Grad: 1.5449  
Epoch: [1][5500/51233] Data 0.307 (0.311) Elapsed 48m 54s (remain 406m 36s) Loss: 0.0723(0.0680) Grad: 1.2413  
Epoch: [1][5520/51233] Data 0.309 (0.311) Elapsed 49m 5s (remain 406m 25s) Loss: 0.0206(0.0679) Grad: 0.3214  
Epoch: [1][5540/51233] Data 0.318 (0.311) Elapsed 49m 15s (remain 406m 14s) Loss: 0.0105(0.0683) Grad: 0.2330  
Epoch: [1][5560/51233] Data 0.307 (0.311) Elapsed 49m 26s (remain 406m 3s) Loss: 0.1503(0.0684) Grad: 1.4531  
Epoch: [1][5580/51233] Data 0.295 (0.311) Elapsed 49m 37s (remain 405m 53s) Loss: 0.0555(0.0685) Grad: 0.5486  
Epoch: [1][5600/51233] Data 0.309 (0.311) Elapsed 49m 47s (remain 405m 42s) Loss: 0.0258(0.0685) Grad: 0.4278  
Epoch: [1][5620/51233] Data 0.308 (0.311) Elapsed 49m 58s (remain 405m 31s) Loss: 0.0161(0.0686) Grad: 0.2057  
Epoch: [1][5640/51233] Data 0.318 (0.311) Elapsed 50m 9s (remain 405m 20s) Loss: 0.0154(0.0687) Grad: 0.1920  
Epoch: [1][5660/51233] Data 0.307 (0.311) Elapsed 50m 19s (remain 405m 10s) Loss: 0.1486(0.0687) Grad: 1.4654  
Epoch: [1][5680/51233] Data 0.309 (0.311) Elapsed 50m 30s (remain 404m 59s) Loss: 0.1262(0.0687) Grad: 1.4896  
Epoch: [1][5700/51233] Data 0.315 (0.311) Elapsed 50m 41s (remain 404m 48s) Loss: 0.0298(0.0688) Grad: 0.4930  
Epoch: [1][5720/51233] Data 0.298 (0.311) Elapsed 50m 51s (remain 404m 38s) Loss: 0.0162(0.0688) Grad: 0.2565  
Epoch: [1][5740/51233] Data 0.309 (0.311) Elapsed 51m 2s (remain 404m 27s) Loss: 0.0161(0.0687) Grad: 0.1861  
Epoch: [1][5760/51233] Data 0.308 (0.311) Elapsed 51m 13s (remain 404m 16s) Loss: 0.0767(0.0686) Grad: 0.9586  
Epoch: [1][5780/51233] Data 0.318 (0.311) Elapsed 51m 23s (remain 404m 5s) Loss: 0.0025(0.0686) Grad: 0.0440  
Epoch: [1][5800/51233] Data 0.318 (0.311) Elapsed 51m 34s (remain 403m 55s) Loss: 0.0274(0.0686) Grad: 0.7668  
Epoch: [1][5820/51233] Data 0.318 (0.311) Elapsed 51m 45s (remain 403m 44s) Loss: 0.0210(0.0686) Grad: 0.5797  
Epoch: [1][5840/51233] Data 0.309 (0.311) Elapsed 51m 55s (remain 403m 33s) Loss: 0.0098(0.0685) Grad: 0.1283  
Epoch: [1][5860/51233] Data 0.318 (0.311) Elapsed 52m 6s (remain 403m 22s) Loss: 0.0106(0.0685) Grad: 0.2298  
Epoch: [1][5880/51233] Data 0.306 (0.311) Elapsed 52m 17s (remain 403m 12s) Loss: 0.0461(0.0684) Grad: 0.5678  
Epoch: [1][5900/51233] Data 0.317 (0.311) Elapsed 52m 27s (remain 403m 1s) Loss: 0.1385(0.0683) Grad: 1.3096  
Epoch: [1][5920/51233] Data 0.310 (0.311) Elapsed 52m 38s (remain 402m 50s) Loss: 0.0600(0.0684) Grad: 1.0837  
Epoch: [1][5940/51233] Data 0.317 (0.311) Elapsed 52m 49s (remain 402m 39s) Loss: 0.0111(0.0684) Grad: 0.1671  
Epoch: [1][5960/51233] Data 0.309 (0.311) Elapsed 52m 59s (remain 402m 29s) Loss: 0.0364(0.0684) Grad: 0.8778  
Epoch: [1][5980/51233] Data 0.309 (0.311) Elapsed 53m 10s (remain 402m 18s) Loss: 0.1465(0.0684) Grad: 1.2914  
Epoch: [1][6000/51233] Data 0.317 (0.311) Elapsed 53m 21s (remain 402m 7s) Loss: 0.1486(0.0685) Grad: 1.3376  
Epoch: [1][6020/51233] Data 0.301 (0.311) Elapsed 53m 31s (remain 401m 56s) Loss: 0.0109(0.0686) Grad: 0.1631  
Epoch: [1][6040/51233] Data 0.312 (0.311) Elapsed 53m 42s (remain 401m 46s) Loss: 0.0368(0.0686) Grad: 0.3921  
Epoch: [1][6060/51233] Data 0.317 (0.311) Elapsed 53m 53s (remain 401m 35s) Loss: 0.0219(0.0686) Grad: 0.5057  
Epoch: [1][6080/51233] Data 0.317 (0.311) Elapsed 54m 3s (remain 401m 24s) Loss: 0.0154(0.0687) Grad: 0.2135  
Epoch: [1][6100/51233] Data 0.314 (0.311) Elapsed 54m 14s (remain 401m 14s) Loss: 0.0150(0.0687) Grad: 0.2775  
Epoch: [1][6120/51233] Data 0.307 (0.311) Elapsed 54m 25s (remain 401m 3s) Loss: 0.0239(0.0687) Grad: 0.3004  
Epoch: [1][6140/51233] Data 0.311 (0.311) Elapsed 54m 35s (remain 400m 52s) Loss: 0.0166(0.0686) Grad: 0.2618  
Epoch: [1][6160/51233] Data 0.318 (0.311) Elapsed 54m 46s (remain 400m 41s) Loss: 0.0177(0.0687) Grad: 0.2616  
Epoch: [1][6180/51233] Data 0.310 (0.311) Elapsed 54m 56s (remain 400m 31s) Loss: 0.0107(0.0686) Grad: 0.1652  
Epoch: [1][6200/51233] Data 0.309 (0.311) Elapsed 55m 7s (remain 400m 20s) Loss: 0.0388(0.0687) Grad: 0.6044  
Epoch: [1][6220/51233] Data 0.315 (0.311) Elapsed 55m 18s (remain 400m 9s) Loss: 0.0202(0.0686) Grad: 0.2181  
Epoch: [1][6240/51233] Data 0.309 (0.311) Elapsed 55m 28s (remain 399m 58s) Loss: 0.0204(0.0687) Grad: 0.4124  
Epoch: [1][6260/51233] Data 0.317 (0.311) Elapsed 55m 39s (remain 399m 48s) Loss: 0.0539(0.0686) Grad: 0.8859  
Epoch: [1][6280/51233] Data 0.292 (0.311) Elapsed 55m 50s (remain 399m 37s) Loss: 0.0023(0.0686) Grad: 0.0520  
Epoch: [1][6300/51233] Data 0.309 (0.311) Elapsed 56m 0s (remain 399m 26s) Loss: 0.0566(0.0686) Grad: 0.8152  
Epoch: [1][6320/51233] Data 0.314 (0.311) Elapsed 56m 11s (remain 399m 15s) Loss: 0.0857(0.0687) Grad: 1.2000  
Epoch: [1][6340/51233] Data 0.296 (0.311) Elapsed 56m 22s (remain 399m 5s) Loss: 0.0819(0.0686) Grad: 1.6807  
Epoch: [1][6360/51233] Data 0.317 (0.311) Elapsed 56m 32s (remain 398m 54s) Loss: 0.0284(0.0686) Grad: 0.5477  
Epoch: [1][6380/51233] Data 0.317 (0.311) Elapsed 56m 43s (remain 398m 43s) Loss: 0.0142(0.0686) Grad: 0.3353  
Epoch: [1][6400/51233] Data 0.317 (0.311) Elapsed 56m 54s (remain 398m 32s) Loss: 0.0317(0.0685) Grad: 0.4201  
Epoch: [1][6420/51233] Data 0.316 (0.311) Elapsed 57m 4s (remain 398m 22s) Loss: 0.0599(0.0685) Grad: 1.2562  
Epoch: [1][6440/51233] Data 0.312 (0.311) Elapsed 57m 15s (remain 398m 11s) Loss: 0.0114(0.0685) Grad: 0.1978  
Epoch: [1][6460/51233] Data 0.309 (0.311) Elapsed 57m 26s (remain 398m 0s) Loss: 0.0054(0.0685) Grad: 0.0925  
Epoch: [1][6480/51233] Data 0.308 (0.311) Elapsed 57m 36s (remain 397m 50s) Loss: 0.0088(0.0685) Grad: 0.2106  
Epoch: [1][6500/51233] Data 0.308 (0.311) Elapsed 57m 47s (remain 397m 39s) Loss: 0.0018(0.0685) Grad: 0.0285  
Epoch: [1][6520/51233] Data 0.309 (0.311) Elapsed 57m 58s (remain 397m 28s) Loss: 0.0051(0.0686) Grad: 0.0674  
Epoch: [1][6540/51233] Data 0.309 (0.311) Elapsed 58m 8s (remain 397m 17s) Loss: 0.0829(0.0687) Grad: 0.7567  
Epoch: [1][6560/51233] Data 0.310 (0.311) Elapsed 58m 19s (remain 397m 7s) Loss: 0.0479(0.0687) Grad: 0.8465  
Epoch: [1][6580/51233] Data 0.295 (0.311) Elapsed 58m 30s (remain 396m 56s) Loss: 0.0125(0.0686) Grad: 0.3253  
Epoch: [1][6600/51233] Data 0.314 (0.311) Elapsed 58m 40s (remain 396m 45s) Loss: 0.0359(0.0686) Grad: 0.6208  
Epoch: [1][6620/51233] Data 0.318 (0.311) Elapsed 58m 51s (remain 396m 35s) Loss: 0.0574(0.0686) Grad: 1.2138  
Epoch: [1][6640/51233] Data 0.315 (0.311) Elapsed 59m 2s (remain 396m 24s) Loss: 0.0929(0.0686) Grad: 1.9339  
Epoch: [1][6660/51233] Data 0.304 (0.311) Elapsed 59m 12s (remain 396m 13s) Loss: 0.0070(0.0687) Grad: 0.0763  
Epoch: [1][6680/51233] Data 0.315 (0.311) Elapsed 59m 23s (remain 396m 2s) Loss: 0.1837(0.0688) Grad: 1.1002  
Epoch: [1][6700/51233] Data 0.306 (0.311) Elapsed 59m 34s (remain 395m 52s) Loss: 0.1461(0.0687) Grad: 1.1074  
Epoch: [1][6720/51233] Data 0.310 (0.311) Elapsed 59m 44s (remain 395m 41s) Loss: 0.3068(0.0687) Grad: 1.3688  
Epoch: [1][6740/51233] Data 0.309 (0.311) Elapsed 59m 55s (remain 395m 30s) Loss: 0.2174(0.0687) Grad: 2.1383  
Epoch: [1][6760/51233] Data 0.311 (0.311) Elapsed 60m 6s (remain 395m 20s) Loss: 0.0116(0.0687) Grad: 0.1335  
Epoch: [1][6780/51233] Data 0.309 (0.311) Elapsed 60m 16s (remain 395m 9s) Loss: 0.0471(0.0687) Grad: 0.5366  
Epoch: [1][6800/51233] Data 0.317 (0.311) Elapsed 60m 27s (remain 394m 58s) Loss: 0.0744(0.0688) Grad: 0.8719  
Epoch: [1][6820/51233] Data 0.313 (0.311) Elapsed 60m 38s (remain 394m 47s) Loss: 0.0274(0.0688) Grad: 0.5146  
Epoch: [1][6840/51233] Data 0.317 (0.311) Elapsed 60m 48s (remain 394m 37s) Loss: 0.0101(0.0688) Grad: 0.1774  
Epoch: [1][6860/51233] Data 0.318 (0.311) Elapsed 60m 59s (remain 394m 26s) Loss: 0.1043(0.0688) Grad: 1.7207  
Epoch: [1][6880/51233] Data 0.306 (0.311) Elapsed 61m 10s (remain 394m 15s) Loss: 0.0552(0.0687) Grad: 1.1269  
Epoch: [1][6900/51233] Data 0.308 (0.311) Elapsed 61m 20s (remain 394m 4s) Loss: 0.1523(0.0687) Grad: 2.7271  
Epoch: [1][6920/51233] Data 0.308 (0.311) Elapsed 61m 31s (remain 393m 54s) Loss: 0.0164(0.0686) Grad: 0.2459  
Epoch: [1][6940/51233] Data 0.313 (0.311) Elapsed 61m 42s (remain 393m 43s) Loss: 0.0286(0.0686) Grad: 0.4416  
Epoch: [1][6960/51233] Data 0.318 (0.311) Elapsed 61m 52s (remain 393m 32s) Loss: 0.2001(0.0685) Grad: 2.1036  
Epoch: [1][6980/51233] Data 0.313 (0.311) Elapsed 62m 3s (remain 393m 22s) Loss: 0.2731(0.0686) Grad: 1.4163  
Epoch: [1][7000/51233] Data 0.317 (0.311) Elapsed 62m 14s (remain 393m 11s) Loss: 0.1214(0.0686) Grad: 0.8141  
Epoch: [1][7020/51233] Data 0.311 (0.311) Elapsed 62m 24s (remain 393m 0s) Loss: 0.0303(0.0685) Grad: 0.6521  
Epoch: [1][7040/51233] Data 0.309 (0.311) Elapsed 62m 35s (remain 392m 49s) Loss: 0.0207(0.0685) Grad: 0.4766  
Epoch: [1][7060/51233] Data 0.318 (0.311) Elapsed 62m 46s (remain 392m 39s) Loss: 0.0140(0.0684) Grad: 0.2252  
Epoch: [1][7080/51233] Data 0.311 (0.311) Elapsed 62m 56s (remain 392m 28s) Loss: 0.1488(0.0684) Grad: 1.1370  
Epoch: [1][7100/51233] Data 0.317 (0.311) Elapsed 63m 7s (remain 392m 17s) Loss: 0.0278(0.0685) Grad: 0.6711  
Epoch: [1][7120/51233] Data 0.308 (0.311) Elapsed 63m 17s (remain 392m 7s) Loss: 0.0011(0.0685) Grad: 0.0150  
Epoch: [1][7140/51233] Data 0.297 (0.311) Elapsed 63m 28s (remain 391m 56s) Loss: 0.0324(0.0686) Grad: 0.3614  
Epoch: [1][7160/51233] Data 0.309 (0.311) Elapsed 63m 39s (remain 391m 45s) Loss: 0.0363(0.0685) Grad: 0.5645  
Epoch: [1][7180/51233] Data 0.306 (0.311) Elapsed 63m 49s (remain 391m 34s) Loss: 0.0176(0.0685) Grad: 0.2329  
Epoch: [1][7200/51233] Data 0.307 (0.311) Elapsed 64m 0s (remain 391m 24s) Loss: 0.0551(0.0684) Grad: 1.0733  
Epoch: [1][7220/51233] Data 0.307 (0.311) Elapsed 64m 11s (remain 391m 13s) Loss: 0.3155(0.0685) Grad: 2.6865  
Epoch: [1][7240/51233] Data 0.317 (0.311) Elapsed 64m 21s (remain 391m 2s) Loss: 0.0032(0.0684) Grad: 0.0530  
Epoch: [1][7260/51233] Data 0.308 (0.311) Elapsed 64m 32s (remain 390m 52s) Loss: 0.2819(0.0683) Grad: 3.2604  
Epoch: [1][7280/51233] Data 0.311 (0.311) Elapsed 64m 43s (remain 390m 41s) Loss: 0.2988(0.0683) Grad: 2.0788  
Epoch: [1][7300/51233] Data 0.307 (0.311) Elapsed 64m 53s (remain 390m 30s) Loss: 0.0172(0.0683) Grad: 0.2330  
Epoch: [1][7320/51233] Data 0.306 (0.311) Elapsed 65m 4s (remain 390m 19s) Loss: 0.0769(0.0682) Grad: 1.2559  
Epoch: [1][7340/51233] Data 0.305 (0.311) Elapsed 65m 15s (remain 390m 9s) Loss: 0.0149(0.0681) Grad: 0.2304  
Epoch: [1][7360/51233] Data 0.317 (0.311) Elapsed 65m 25s (remain 389m 58s) Loss: 0.0387(0.0681) Grad: 0.5899  
Epoch: [1][7380/51233] Data 0.317 (0.311) Elapsed 65m 36s (remain 389m 47s) Loss: 0.1004(0.0682) Grad: 1.7106  
Epoch: [1][7400/51233] Data 0.317 (0.311) Elapsed 65m 47s (remain 389m 37s) Loss: 0.0847(0.0681) Grad: 1.2798  
Epoch: [1][7420/51233] Data 0.308 (0.311) Elapsed 65m 57s (remain 389m 26s) Loss: 0.1444(0.0681) Grad: 1.7427  
Epoch: [1][7440/51233] Data 0.311 (0.311) Elapsed 66m 8s (remain 389m 15s) Loss: 0.0102(0.0681) Grad: 0.1569  
Epoch: [1][7460/51233] Data 0.304 (0.311) Elapsed 66m 19s (remain 389m 4s) Loss: 0.0185(0.0680) Grad: 0.4127  
Epoch: [1][7480/51233] Data 0.313 (0.311) Elapsed 66m 29s (remain 388m 54s) Loss: 0.2439(0.0679) Grad: 2.6748  
Epoch: [1][7500/51233] Data 0.309 (0.311) Elapsed 66m 40s (remain 388m 43s) Loss: 0.0054(0.0679) Grad: 0.1114  
Epoch: [1][7520/51233] Data 0.318 (0.311) Elapsed 66m 51s (remain 388m 32s) Loss: 0.0026(0.0680) Grad: 0.0594  
Epoch: [1][7540/51233] Data 0.297 (0.311) Elapsed 67m 1s (remain 388m 22s) Loss: 0.0068(0.0680) Grad: 0.2215  
Epoch: [1][7560/51233] Data 0.318 (0.311) Elapsed 67m 12s (remain 388m 11s) Loss: 0.0579(0.0680) Grad: 0.7076  
Epoch: [1][7580/51233] Data 0.307 (0.311) Elapsed 67m 23s (remain 388m 0s) Loss: 0.0522(0.0680) Grad: 0.6385  
Epoch: [1][7600/51233] Data 0.308 (0.311) Elapsed 67m 33s (remain 387m 50s) Loss: 0.0174(0.0680) Grad: 0.2922  
Epoch: [1][7620/51233] Data 0.314 (0.311) Elapsed 67m 44s (remain 387m 39s) Loss: 0.0173(0.0679) Grad: 0.3099  
Epoch: [1][7640/51233] Data 0.307 (0.311) Elapsed 67m 55s (remain 387m 28s) Loss: 0.0300(0.0680) Grad: 0.4924  
Epoch: [1][7660/51233] Data 0.309 (0.311) Elapsed 68m 5s (remain 387m 17s) Loss: 0.0026(0.0679) Grad: 0.0497  
Epoch: [1][7680/51233] Data 0.308 (0.311) Elapsed 68m 16s (remain 387m 7s) Loss: 0.0067(0.0678) Grad: 0.1759  
Epoch: [1][7700/51233] Data 0.311 (0.311) Elapsed 68m 27s (remain 386m 56s) Loss: 0.2886(0.0678) Grad: 2.0439  
Epoch: [1][7720/51233] Data 0.310 (0.311) Elapsed 68m 37s (remain 386m 45s) Loss: 0.0328(0.0679) Grad: 0.5148  
Epoch: [1][7740/51233] Data 0.298 (0.311) Elapsed 68m 48s (remain 386m 35s) Loss: 0.1523(0.0679) Grad: 1.5010  
Epoch: [1][7760/51233] Data 0.309 (0.311) Elapsed 68m 59s (remain 386m 24s) Loss: 0.0225(0.0679) Grad: 0.4932  
Epoch: [1][7780/51233] Data 0.308 (0.311) Elapsed 69m 9s (remain 386m 13s) Loss: 0.1455(0.0679) Grad: 1.5984  
Epoch: [1][7800/51233] Data 0.309 (0.311) Elapsed 69m 20s (remain 386m 2s) Loss: 0.0093(0.0679) Grad: 0.1373  
Epoch: [1][7820/51233] Data 0.308 (0.311) Elapsed 69m 31s (remain 385m 52s) Loss: 0.0082(0.0678) Grad: 0.1091  
Epoch: [1][7840/51233] Data 0.307 (0.311) Elapsed 69m 41s (remain 385m 41s) Loss: 0.0066(0.0678) Grad: 0.1258  
Epoch: [1][7860/51233] Data 0.308 (0.311) Elapsed 69m 52s (remain 385m 30s) Loss: 0.1071(0.0678) Grad: 1.2459  
Epoch: [1][7880/51233] Data 0.306 (0.311) Elapsed 70m 3s (remain 385m 20s) Loss: 0.0058(0.0679) Grad: 0.1483  
Epoch: [1][7900/51233] Data 0.317 (0.311) Elapsed 70m 13s (remain 385m 9s) Loss: 0.0377(0.0678) Grad: 0.2674  
Epoch: [1][7920/51233] Data 0.303 (0.311) Elapsed 70m 24s (remain 384m 58s) Loss: 0.1526(0.0678) Grad: 1.8533  
Epoch: [1][7940/51233] Data 0.304 (0.311) Elapsed 70m 35s (remain 384m 48s) Loss: 0.0093(0.0677) Grad: 0.1225  
Epoch: [1][7960/51233] Data 0.311 (0.311) Elapsed 70m 45s (remain 384m 37s) Loss: 0.0145(0.0677) Grad: 0.4425  
Epoch: [1][7980/51233] Data 0.308 (0.311) Elapsed 70m 56s (remain 384m 26s) Loss: 0.0272(0.0677) Grad: 0.7213  
Epoch: [1][8000/51233] Data 0.316 (0.311) Elapsed 71m 6s (remain 384m 15s) Loss: 0.0063(0.0678) Grad: 0.0815  
Epoch: [1][8020/51233] Data 0.309 (0.311) Elapsed 71m 17s (remain 384m 5s) Loss: 0.0766(0.0678) Grad: 0.5214  
Epoch: [1][8040/51233] Data 0.311 (0.311) Elapsed 71m 28s (remain 383m 54s) Loss: 0.3169(0.0678) Grad: 1.9094  
Epoch: [1][8060/51233] Data 0.320 (0.311) Elapsed 71m 38s (remain 383m 43s) Loss: 0.0058(0.0678) Grad: 0.0588  
Epoch: [1][8080/51233] Data 0.317 (0.311) Elapsed 71m 49s (remain 383m 33s) Loss: 0.0543(0.0677) Grad: 1.2330  
Epoch: [1][8100/51233] Data 0.300 (0.311) Elapsed 72m 0s (remain 383m 22s) Loss: 0.0264(0.0678) Grad: 0.3994  
Epoch: [1][8120/51233] Data 0.295 (0.311) Elapsed 72m 10s (remain 383m 11s) Loss: 0.1287(0.0679) Grad: 1.5866  
Epoch: [1][8140/51233] Data 0.303 (0.311) Elapsed 72m 21s (remain 383m 1s) Loss: 0.0953(0.0679) Grad: 1.1911  
Epoch: [1][8160/51233] Data 0.308 (0.311) Elapsed 72m 32s (remain 382m 50s) Loss: 0.0284(0.0678) Grad: 0.4800  
Epoch: [1][8180/51233] Data 0.317 (0.311) Elapsed 72m 42s (remain 382m 39s) Loss: 0.0188(0.0678) Grad: 0.2833  
Epoch: [1][8200/51233] Data 0.314 (0.311) Elapsed 72m 53s (remain 382m 28s) Loss: 0.1118(0.0678) Grad: 1.4703  
Epoch: [1][8220/51233] Data 0.312 (0.311) Elapsed 73m 4s (remain 382m 18s) Loss: 0.0108(0.0678) Grad: 0.1753  
Epoch: [1][8240/51233] Data 0.313 (0.311) Elapsed 73m 14s (remain 382m 7s) Loss: 0.2095(0.0677) Grad: 1.6944  
Epoch: [1][8260/51233] Data 0.307 (0.311) Elapsed 73m 25s (remain 381m 56s) Loss: 0.0365(0.0677) Grad: 0.5911  
Epoch: [1][8280/51233] Data 0.316 (0.311) Elapsed 73m 36s (remain 381m 45s) Loss: 0.0844(0.0677) Grad: 0.9378  
Epoch: [1][8300/51233] Data 0.310 (0.311) Elapsed 73m 46s (remain 381m 35s) Loss: 0.0346(0.0678) Grad: 0.9139  
Epoch: [1][8320/51233] Data 0.310 (0.311) Elapsed 73m 57s (remain 381m 24s) Loss: 0.0072(0.0678) Grad: 0.1057  
Epoch: [1][8340/51233] Data 0.310 (0.311) Elapsed 74m 8s (remain 381m 13s) Loss: 0.2824(0.0678) Grad: 1.4066  
Epoch: [1][8360/51233] Data 0.308 (0.311) Elapsed 74m 18s (remain 381m 3s) Loss: 0.0017(0.0678) Grad: 0.0192  
Epoch: [1][8380/51233] Data 0.317 (0.311) Elapsed 74m 29s (remain 380m 52s) Loss: 0.0038(0.0679) Grad: 0.0419  
Epoch: [1][8400/51233] Data 0.318 (0.311) Elapsed 74m 40s (remain 380m 41s) Loss: 0.0280(0.0679) Grad: 0.5973  
Epoch: [1][8420/51233] Data 0.313 (0.311) Elapsed 74m 50s (remain 380m 31s) Loss: 0.0188(0.0678) Grad: 0.4044  
Epoch: [1][8440/51233] Data 0.317 (0.311) Elapsed 75m 1s (remain 380m 20s) Loss: 0.0026(0.0678) Grad: 0.0388  
Epoch: [1][8460/51233] Data 0.316 (0.311) Elapsed 75m 12s (remain 380m 9s) Loss: 0.0057(0.0678) Grad: 0.0984  
Epoch: [1][8480/51233] Data 0.313 (0.311) Elapsed 75m 22s (remain 379m 58s) Loss: 0.0065(0.0677) Grad: 0.1282  
Epoch: [1][8500/51233] Data 0.318 (0.311) Elapsed 75m 33s (remain 379m 48s) Loss: 0.3365(0.0678) Grad: 4.2881  
Epoch: [1][8520/51233] Data 0.310 (0.311) Elapsed 75m 44s (remain 379m 37s) Loss: 0.0208(0.0678) Grad: 0.2336  
Epoch: [1][8540/51233] Data 0.318 (0.311) Elapsed 75m 54s (remain 379m 26s) Loss: 0.0109(0.0678) Grad: 0.1249  
Epoch: [1][8560/51233] Data 0.306 (0.311) Elapsed 76m 5s (remain 379m 16s) Loss: 0.0210(0.0678) Grad: 0.3852  
Epoch: [1][8580/51233] Data 0.318 (0.311) Elapsed 76m 16s (remain 379m 5s) Loss: 0.0212(0.0678) Grad: 0.3207  
Epoch: [1][8600/51233] Data 0.299 (0.311) Elapsed 76m 26s (remain 378m 54s) Loss: 0.0095(0.0679) Grad: 0.1836  
Epoch: [1][8620/51233] Data 0.307 (0.311) Elapsed 76m 37s (remain 378m 44s) Loss: 0.0119(0.0679) Grad: 0.1987  
Epoch: [1][8640/51233] Data 0.306 (0.311) Elapsed 76m 48s (remain 378m 33s) Loss: 0.0146(0.0678) Grad: 0.3515  
Epoch: [1][8660/51233] Data 0.316 (0.311) Elapsed 76m 58s (remain 378m 22s) Loss: 0.1545(0.0679) Grad: 1.8661  
Epoch: [1][8680/51233] Data 0.305 (0.311) Elapsed 77m 9s (remain 378m 11s) Loss: 0.1525(0.0679) Grad: 1.5932  
Epoch: [1][8700/51233] Data 0.313 (0.311) Elapsed 77m 20s (remain 378m 1s) Loss: 0.0941(0.0679) Grad: 1.7106  
Epoch: [1][8720/51233] Data 0.311 (0.311) Elapsed 77m 30s (remain 377m 50s) Loss: 0.0245(0.0678) Grad: 0.5057  
Epoch: [1][8740/51233] Data 0.317 (0.311) Elapsed 77m 41s (remain 377m 39s) Loss: 0.2214(0.0678) Grad: 1.7551  
Epoch: [1][8760/51233] Data 0.308 (0.311) Elapsed 77m 51s (remain 377m 29s) Loss: 0.0247(0.0678) Grad: 0.4082  
Epoch: [1][8780/51233] Data 0.310 (0.311) Elapsed 78m 2s (remain 377m 18s) Loss: 0.0116(0.0677) Grad: 0.1643  
Epoch: [1][8800/51233] Data 0.319 (0.311) Elapsed 78m 13s (remain 377m 7s) Loss: 0.0077(0.0677) Grad: 0.1164  
Epoch: [1][8820/51233] Data 0.310 (0.311) Elapsed 78m 23s (remain 376m 56s) Loss: 0.2360(0.0677) Grad: 2.0205  
Epoch: [1][8840/51233] Data 0.308 (0.311) Elapsed 78m 34s (remain 376m 46s) Loss: 0.0363(0.0676) Grad: 0.7018  
Epoch: [1][8860/51233] Data 0.306 (0.311) Elapsed 78m 45s (remain 376m 35s) Loss: 0.1299(0.0677) Grad: 2.7262  
Epoch: [1][8880/51233] Data 0.309 (0.311) Elapsed 78m 55s (remain 376m 24s) Loss: 0.0435(0.0676) Grad: 0.9852  
Epoch: [1][8900/51233] Data 0.318 (0.311) Elapsed 79m 6s (remain 376m 14s) Loss: 0.2734(0.0676) Grad: 1.6442  
Epoch: [1][8920/51233] Data 0.305 (0.311) Elapsed 79m 17s (remain 376m 3s) Loss: 0.0276(0.0677) Grad: 0.6345  
Epoch: [1][8940/51233] Data 0.307 (0.311) Elapsed 79m 27s (remain 375m 52s) Loss: 0.0001(0.0676) Grad: 0.0015  
Epoch: [1][8960/51233] Data 0.293 (0.311) Elapsed 79m 38s (remain 375m 42s) Loss: 0.0174(0.0676) Grad: 0.2940  
Epoch: [1][8980/51233] Data 0.316 (0.311) Elapsed 79m 49s (remain 375m 31s) Loss: 0.0806(0.0676) Grad: 1.3808  
Epoch: [1][9000/51233] Data 0.307 (0.311) Elapsed 79m 59s (remain 375m 20s) Loss: 0.0032(0.0676) Grad: 0.0416  
Epoch: [1][9020/51233] Data 0.318 (0.311) Elapsed 80m 10s (remain 375m 9s) Loss: 0.0324(0.0675) Grad: 0.9453  
Epoch: [1][9040/51233] Data 0.315 (0.311) Elapsed 80m 21s (remain 374m 59s) Loss: 0.0310(0.0676) Grad: 1.0044  
Epoch: [1][9060/51233] Data 0.294 (0.311) Elapsed 80m 31s (remain 374m 48s) Loss: 0.0014(0.0676) Grad: 0.0269  
Epoch: [1][9080/51233] Data 0.297 (0.311) Elapsed 80m 42s (remain 374m 37s) Loss: 0.0967(0.0676) Grad: 1.5862  
Epoch: [1][9100/51233] Data 0.310 (0.311) Elapsed 80m 53s (remain 374m 27s) Loss: 0.0009(0.0676) Grad: 0.0090  
Epoch: [1][9120/51233] Data 0.318 (0.311) Elapsed 81m 3s (remain 374m 16s) Loss: 0.0074(0.0676) Grad: 0.1813  
Epoch: [1][9140/51233] Data 0.315 (0.311) Elapsed 81m 14s (remain 374m 5s) Loss: 0.0396(0.0675) Grad: 0.6161  
Epoch: [1][9160/51233] Data 0.300 (0.311) Elapsed 81m 25s (remain 373m 55s) Loss: 0.0082(0.0675) Grad: 0.1001  
Epoch: [1][9180/51233] Data 0.300 (0.311) Elapsed 81m 35s (remain 373m 44s) Loss: 0.0329(0.0675) Grad: 0.5592  
Epoch: [1][9200/51233] Data 0.309 (0.311) Elapsed 81m 46s (remain 373m 33s) Loss: 0.0130(0.0675) Grad: 0.2439  
Epoch: [1][9220/51233] Data 0.304 (0.311) Elapsed 81m 57s (remain 373m 23s) Loss: 0.0017(0.0674) Grad: 0.0176  
Epoch: [1][9240/51233] Data 0.307 (0.311) Elapsed 82m 7s (remain 373m 12s) Loss: 0.0720(0.0674) Grad: 1.0349  
Epoch: [1][9260/51233] Data 0.278 (0.311) Elapsed 82m 18s (remain 373m 1s) Loss: 0.0043(0.0673) Grad: 0.0721  
Epoch: [1][9280/51233] Data 0.314 (0.311) Elapsed 82m 29s (remain 372m 50s) Loss: 0.0036(0.0673) Grad: 0.0414  
Epoch: [1][9300/51233] Data 0.315 (0.311) Elapsed 82m 39s (remain 372m 40s) Loss: 0.0072(0.0673) Grad: 0.1058  
Epoch: [1][9320/51233] Data 0.296 (0.311) Elapsed 82m 50s (remain 372m 29s) Loss: 0.0642(0.0673) Grad: 0.9144  
Epoch: [1][9340/51233] Data 0.310 (0.311) Elapsed 83m 1s (remain 372m 18s) Loss: 0.0492(0.0674) Grad: 0.6783  
Epoch: [1][9360/51233] Data 0.310 (0.311) Elapsed 83m 11s (remain 372m 8s) Loss: 0.0280(0.0673) Grad: 0.6754  
Epoch: [1][9380/51233] Data 0.311 (0.311) Elapsed 83m 22s (remain 371m 57s) Loss: 0.0102(0.0674) Grad: 0.1495  
Epoch: [1][9400/51233] Data 0.317 (0.311) Elapsed 83m 33s (remain 371m 46s) Loss: 0.1872(0.0674) Grad: 1.8617  
Epoch: [1][9420/51233] Data 0.310 (0.311) Elapsed 83m 43s (remain 371m 36s) Loss: 0.0351(0.0674) Grad: 0.4663  
Epoch: [1][9440/51233] Data 0.306 (0.311) Elapsed 83m 54s (remain 371m 25s) Loss: 0.0346(0.0674) Grad: 0.4876  
Epoch: [1][9460/51233] Data 0.317 (0.311) Elapsed 84m 5s (remain 371m 14s) Loss: 0.0033(0.0674) Grad: 0.0527  
Epoch: [1][9480/51233] Data 0.317 (0.311) Elapsed 84m 15s (remain 371m 3s) Loss: 0.1405(0.0674) Grad: 1.0133  
Epoch: [1][9500/51233] Data 0.310 (0.311) Elapsed 84m 26s (remain 370m 53s) Loss: 0.0370(0.0674) Grad: 0.8120  
Epoch: [1][9520/51233] Data 0.318 (0.311) Elapsed 84m 36s (remain 370m 42s) Loss: 0.0492(0.0674) Grad: 1.0166  
Epoch: [1][9540/51233] Data 0.317 (0.311) Elapsed 84m 47s (remain 370m 31s) Loss: 0.0465(0.0674) Grad: 0.9942  
Epoch: [1][9560/51233] Data 0.310 (0.311) Elapsed 84m 58s (remain 370m 21s) Loss: 0.0125(0.0674) Grad: 0.1458  
Epoch: [1][9580/51233] Data 0.309 (0.311) Elapsed 85m 8s (remain 370m 10s) Loss: 0.0208(0.0674) Grad: 0.4415  
Epoch: [1][9600/51233] Data 0.309 (0.311) Elapsed 85m 19s (remain 369m 59s) Loss: 0.0310(0.0674) Grad: 0.5374  
Epoch: [1][9620/51233] Data 0.308 (0.311) Elapsed 85m 30s (remain 369m 49s) Loss: 0.0135(0.0674) Grad: 0.1779  
Epoch: [1][9640/51233] Data 0.307 (0.311) Elapsed 85m 40s (remain 369m 38s) Loss: 0.0149(0.0673) Grad: 0.3462  
Epoch: [1][9660/51233] Data 0.318 (0.311) Elapsed 85m 51s (remain 369m 27s) Loss: 0.1772(0.0673) Grad: 1.3975  
Epoch: [1][9680/51233] Data 0.311 (0.311) Elapsed 86m 2s (remain 369m 17s) Loss: 0.0042(0.0672) Grad: 0.0584  
Epoch: [1][9700/51233] Data 0.317 (0.311) Elapsed 86m 12s (remain 369m 6s) Loss: 0.1040(0.0672) Grad: 0.9176  
Epoch: [1][9720/51233] Data 0.317 (0.311) Elapsed 86m 23s (remain 368m 55s) Loss: 0.0013(0.0672) Grad: 0.0131  
Epoch: [1][9740/51233] Data 0.307 (0.311) Elapsed 86m 34s (remain 368m 44s) Loss: 0.0742(0.0672) Grad: 1.6816  
Epoch: [1][9760/51233] Data 0.314 (0.311) Elapsed 86m 44s (remain 368m 34s) Loss: 0.0666(0.0672) Grad: 1.0331  
Epoch: [1][9780/51233] Data 0.307 (0.311) Elapsed 86m 55s (remain 368m 23s) Loss: 0.0120(0.0673) Grad: 0.2495  
Epoch: [1][9800/51233] Data 0.311 (0.311) Elapsed 87m 6s (remain 368m 12s) Loss: 0.0375(0.0673) Grad: 0.4998  
Epoch: [1][9820/51233] Data 0.318 (0.311) Elapsed 87m 16s (remain 368m 2s) Loss: 0.0424(0.0673) Grad: 0.4679  
Epoch: [1][9840/51233] Data 0.310 (0.311) Elapsed 87m 27s (remain 367m 51s) Loss: 0.3009(0.0673) Grad: 1.2877  
Epoch: [1][9860/51233] Data 0.307 (0.311) Elapsed 87m 38s (remain 367m 40s) Loss: 0.0377(0.0673) Grad: 0.6563  
Epoch: [1][9880/51233] Data 0.318 (0.311) Elapsed 87m 48s (remain 367m 29s) Loss: 0.2183(0.0672) Grad: 1.5889  
Epoch: [1][9900/51233] Data 0.317 (0.311) Elapsed 87m 59s (remain 367m 19s) Loss: 0.0087(0.0672) Grad: 0.1165  
Epoch: [1][9920/51233] Data 0.309 (0.311) Elapsed 88m 10s (remain 367m 8s) Loss: 0.2473(0.0672) Grad: 1.2978  
Epoch: [1][9940/51233] Data 0.308 (0.311) Elapsed 88m 20s (remain 366m 57s) Loss: 0.0247(0.0672) Grad: 0.2714  
Epoch: [1][9960/51233] Data 0.307 (0.311) Elapsed 88m 31s (remain 366m 47s) Loss: 0.0200(0.0673) Grad: 0.2916  
Epoch: [1][9980/51233] Data 0.309 (0.311) Elapsed 88m 42s (remain 366m 36s) Loss: 0.0236(0.0673) Grad: 0.4055  
Epoch: [1][10000/51233] Data 0.318 (0.311) Elapsed 88m 52s (remain 366m 25s) Loss: 0.1659(0.0673) Grad: 1.4448  
Epoch: [1][10020/51233] Data 0.301 (0.311) Elapsed 89m 3s (remain 366m 15s) Loss: 0.0054(0.0673) Grad: 0.1245  
Epoch: [1][10040/51233] Data 0.318 (0.311) Elapsed 89m 14s (remain 366m 4s) Loss: 0.0245(0.0672) Grad: 0.3313  
Epoch: [1][10060/51233] Data 0.317 (0.311) Elapsed 89m 24s (remain 365m 53s) Loss: 0.0242(0.0672) Grad: 0.5514  
Epoch: [1][10080/51233] Data 0.317 (0.311) Elapsed 89m 35s (remain 365m 42s) Loss: 0.0349(0.0673) Grad: 0.5172  
Epoch: [1][10100/51233] Data 0.297 (0.311) Elapsed 89m 46s (remain 365m 32s) Loss: 0.0342(0.0673) Grad: 0.7961  
Epoch: [1][10120/51233] Data 0.316 (0.311) Elapsed 89m 56s (remain 365m 21s) Loss: 0.1260(0.0673) Grad: 1.2558  
Epoch: [1][10140/51233] Data 0.311 (0.311) Elapsed 90m 7s (remain 365m 10s) Loss: 0.0194(0.0673) Grad: 0.4448  
Epoch: [1][10160/51233] Data 0.312 (0.311) Elapsed 90m 18s (remain 365m 0s) Loss: 0.0105(0.0672) Grad: 0.1911  
Epoch: [1][10180/51233] Data 0.318 (0.311) Elapsed 90m 28s (remain 364m 49s) Loss: 0.0162(0.0672) Grad: 0.2581  
Epoch: [1][10200/51233] Data 0.318 (0.311) Elapsed 90m 39s (remain 364m 38s) Loss: 0.0655(0.0672) Grad: 0.9514  
Epoch: [1][10220/51233] Data 0.309 (0.311) Elapsed 90m 49s (remain 364m 28s) Loss: 0.2258(0.0672) Grad: 1.7982  
Epoch: [1][10240/51233] Data 0.318 (0.311) Elapsed 91m 0s (remain 364m 17s) Loss: 0.0072(0.0672) Grad: 0.1168  
Epoch: [1][10260/51233] Data 0.319 (0.311) Elapsed 91m 11s (remain 364m 6s) Loss: 0.0146(0.0672) Grad: 0.2047  
Epoch: [1][10280/51233] Data 0.306 (0.311) Elapsed 91m 21s (remain 363m 56s) Loss: 0.0025(0.0672) Grad: 0.0446  
Epoch: [1][10300/51233] Data 0.307 (0.311) Elapsed 91m 32s (remain 363m 45s) Loss: 0.0235(0.0671) Grad: 0.4431  
Epoch: [1][10320/51233] Data 0.318 (0.311) Elapsed 91m 43s (remain 363m 34s) Loss: 0.0496(0.0671) Grad: 0.9247  
Epoch: [1][10340/51233] Data 0.310 (0.311) Elapsed 91m 53s (remain 363m 24s) Loss: 0.1330(0.0671) Grad: 2.0595  
Epoch: [1][10360/51233] Data 0.318 (0.311) Elapsed 92m 4s (remain 363m 13s) Loss: 0.0174(0.0671) Grad: 0.3442  
Epoch: [1][10380/51233] Data 0.311 (0.311) Elapsed 92m 15s (remain 363m 2s) Loss: 0.3254(0.0671) Grad: 1.7495  
Epoch: [1][10400/51233] Data 0.296 (0.311) Elapsed 92m 25s (remain 362m 51s) Loss: 0.0282(0.0672) Grad: 0.9083  
Epoch: [1][10420/51233] Data 0.307 (0.311) Elapsed 92m 36s (remain 362m 41s) Loss: 0.4551(0.0672) Grad: 2.3860  
Epoch: [1][10440/51233] Data 0.314 (0.311) Elapsed 92m 47s (remain 362m 30s) Loss: 0.0122(0.0671) Grad: 0.2558  
Epoch: [1][10460/51233] Data 0.310 (0.311) Elapsed 92m 57s (remain 362m 19s) Loss: 0.0161(0.0671) Grad: 0.2060  
Epoch: [1][10480/51233] Data 0.318 (0.311) Elapsed 93m 8s (remain 362m 9s) Loss: 0.0035(0.0671) Grad: 0.0938  
Epoch: [1][10500/51233] Data 0.302 (0.311) Elapsed 93m 19s (remain 361m 58s) Loss: 0.0089(0.0671) Grad: 0.1463  
Epoch: [1][10520/51233] Data 0.309 (0.311) Elapsed 93m 29s (remain 361m 47s) Loss: 0.0052(0.0670) Grad: 0.0730  
Epoch: [1][10540/51233] Data 0.310 (0.311) Elapsed 93m 40s (remain 361m 36s) Loss: 0.0947(0.0670) Grad: 1.0334  
Epoch: [1][10560/51233] Data 0.311 (0.311) Elapsed 93m 51s (remain 361m 26s) Loss: 0.0048(0.0670) Grad: 0.1521  
Epoch: [1][10580/51233] Data 0.306 (0.311) Elapsed 94m 1s (remain 361m 15s) Loss: 0.0361(0.0670) Grad: 0.6139  
Epoch: [1][10600/51233] Data 0.308 (0.311) Elapsed 94m 12s (remain 361m 4s) Loss: 0.0834(0.0670) Grad: 1.5529  
Epoch: [1][10620/51233] Data 0.312 (0.311) Elapsed 94m 23s (remain 360m 54s) Loss: 0.0092(0.0670) Grad: 0.1731  
Epoch: [1][10640/51233] Data 0.313 (0.311) Elapsed 94m 33s (remain 360m 43s) Loss: 0.0042(0.0669) Grad: 0.0987  
Epoch: [1][10660/51233] Data 0.318 (0.311) Elapsed 94m 44s (remain 360m 32s) Loss: 0.1309(0.0670) Grad: 1.7813  
Epoch: [1][10680/51233] Data 0.307 (0.311) Elapsed 94m 55s (remain 360m 22s) Loss: 0.0056(0.0670) Grad: 0.0824  
Epoch: [1][10700/51233] Data 0.314 (0.311) Elapsed 95m 5s (remain 360m 11s) Loss: 0.0019(0.0669) Grad: 0.0251  
Epoch: [1][10720/51233] Data 0.317 (0.311) Elapsed 95m 16s (remain 360m 0s) Loss: 0.0381(0.0670) Grad: 0.4489  
Epoch: [1][10740/51233] Data 0.310 (0.311) Elapsed 95m 27s (remain 359m 50s) Loss: 0.0374(0.0670) Grad: 0.5762  
Epoch: [1][10760/51233] Data 0.290 (0.311) Elapsed 95m 37s (remain 359m 39s) Loss: 0.0938(0.0670) Grad: 1.2678  
Epoch: [1][10780/51233] Data 0.310 (0.311) Elapsed 95m 48s (remain 359m 28s) Loss: 0.0101(0.0669) Grad: 0.1599  
Epoch: [1][10800/51233] Data 0.310 (0.311) Elapsed 95m 58s (remain 359m 17s) Loss: 0.0537(0.0669) Grad: 1.1813  
Epoch: [1][10820/51233] Data 0.310 (0.311) Elapsed 96m 9s (remain 359m 7s) Loss: 0.0756(0.0668) Grad: 1.3051  
Epoch: [1][10840/51233] Data 0.318 (0.311) Elapsed 96m 20s (remain 358m 56s) Loss: 0.0203(0.0668) Grad: 0.4465  
Epoch: [1][10860/51233] Data 0.317 (0.311) Elapsed 96m 30s (remain 358m 45s) Loss: 0.0465(0.0668) Grad: 0.5200  
Epoch: [1][10880/51233] Data 0.313 (0.311) Elapsed 96m 41s (remain 358m 35s) Loss: 0.0120(0.0667) Grad: 0.1690  
Epoch: [1][10900/51233] Data 0.312 (0.311) Elapsed 96m 52s (remain 358m 24s) Loss: 0.0832(0.0667) Grad: 1.3185  
Epoch: [1][10920/51233] Data 0.317 (0.311) Elapsed 97m 2s (remain 358m 13s) Loss: 0.0008(0.0666) Grad: 0.0171  
Epoch: [1][10940/51233] Data 0.318 (0.311) Elapsed 97m 13s (remain 358m 3s) Loss: 0.0041(0.0665) Grad: 0.0750  
Epoch: [1][10960/51233] Data 0.318 (0.311) Elapsed 97m 24s (remain 357m 52s) Loss: 0.0013(0.0665) Grad: 0.0244  
Epoch: [1][10980/51233] Data 0.311 (0.311) Elapsed 97m 34s (remain 357m 41s) Loss: 0.1260(0.0666) Grad: 1.5265  
Epoch: [1][11000/51233] Data 0.315 (0.311) Elapsed 97m 45s (remain 357m 31s) Loss: 0.0167(0.0666) Grad: 0.3088  
Epoch: [1][11020/51233] Data 0.317 (0.311) Elapsed 97m 56s (remain 357m 20s) Loss: 0.0470(0.0665) Grad: 0.6579  
Epoch: [1][11040/51233] Data 0.305 (0.311) Elapsed 98m 6s (remain 357m 9s) Loss: 0.0460(0.0665) Grad: 0.9881  
Epoch: [1][11060/51233] Data 0.311 (0.311) Elapsed 98m 17s (remain 356m 59s) Loss: 0.0012(0.0664) Grad: 0.0298  
Epoch: [1][11080/51233] Data 0.318 (0.311) Elapsed 98m 28s (remain 356m 48s) Loss: 0.2865(0.0664) Grad: 1.7304  
Epoch: [1][11100/51233] Data 0.308 (0.311) Elapsed 98m 38s (remain 356m 37s) Loss: 0.0054(0.0663) Grad: 0.0781  
Epoch: [1][11120/51233] Data 0.316 (0.311) Elapsed 98m 49s (remain 356m 26s) Loss: 0.3249(0.0663) Grad: 1.1511  
Epoch: [1][11140/51233] Data 0.305 (0.311) Elapsed 99m 0s (remain 356m 16s) Loss: 0.0046(0.0663) Grad: 0.0802  
Epoch: [1][11160/51233] Data 0.309 (0.311) Elapsed 99m 10s (remain 356m 5s) Loss: 0.0454(0.0663) Grad: 1.2493  
Epoch: [1][11180/51233] Data 0.320 (0.311) Elapsed 99m 21s (remain 355m 54s) Loss: 0.0232(0.0663) Grad: 0.3217  
Epoch: [1][11200/51233] Data 0.295 (0.311) Elapsed 99m 32s (remain 355m 44s) Loss: 0.0059(0.0663) Grad: 0.1195  
Epoch: [1][11220/51233] Data 0.308 (0.311) Elapsed 99m 42s (remain 355m 33s) Loss: 0.0090(0.0663) Grad: 0.1945  
Epoch: [1][11240/51233] Data 0.316 (0.311) Elapsed 99m 53s (remain 355m 22s) Loss: 0.0087(0.0662) Grad: 0.1302  
Epoch: [1][11260/51233] Data 0.317 (0.311) Elapsed 100m 4s (remain 355m 12s) Loss: 0.0837(0.0662) Grad: 1.1078  
Epoch: [1][11280/51233] Data 0.309 (0.311) Elapsed 100m 14s (remain 355m 1s) Loss: 0.0705(0.0662) Grad: 0.9827  
Epoch: [1][11300/51233] Data 0.310 (0.311) Elapsed 100m 25s (remain 354m 50s) Loss: 0.0020(0.0662) Grad: 0.0247  
Epoch: [1][11320/51233] Data 0.317 (0.311) Elapsed 100m 36s (remain 354m 40s) Loss: 0.0875(0.0662) Grad: 1.5792  
Epoch: [1][11340/51233] Data 0.299 (0.311) Elapsed 100m 46s (remain 354m 29s) Loss: 0.1411(0.0662) Grad: 2.2105  
Epoch: [1][11360/51233] Data 0.303 (0.311) Elapsed 100m 57s (remain 354m 18s) Loss: 0.0131(0.0662) Grad: 0.3960  
Epoch: [1][11380/51233] Data 0.308 (0.311) Elapsed 101m 8s (remain 354m 7s) Loss: 0.0037(0.0662) Grad: 0.0688  
Epoch: [1][11400/51233] Data 0.306 (0.311) Elapsed 101m 18s (remain 353m 57s) Loss: 0.0013(0.0662) Grad: 0.0144  
Epoch: [1][11420/51233] Data 0.314 (0.311) Elapsed 101m 29s (remain 353m 46s) Loss: 0.0241(0.0661) Grad: 0.3901  
Epoch: [1][11440/51233] Data 0.305 (0.311) Elapsed 101m 40s (remain 353m 35s) Loss: 0.0223(0.0662) Grad: 0.4184  
Epoch: [1][11460/51233] Data 0.316 (0.311) Elapsed 101m 50s (remain 353m 25s) Loss: 0.0691(0.0662) Grad: 1.0132  
Epoch: [1][11480/51233] Data 0.318 (0.311) Elapsed 102m 1s (remain 353m 14s) Loss: 0.0651(0.0661) Grad: 1.2133  
Epoch: [1][11500/51233] Data 0.306 (0.311) Elapsed 102m 11s (remain 353m 3s) Loss: 0.0016(0.0661) Grad: 0.0446  
Epoch: [1][11520/51233] Data 0.308 (0.311) Elapsed 102m 22s (remain 352m 53s) Loss: 0.0169(0.0661) Grad: 0.5885  
Epoch: [1][11540/51233] Data 0.310 (0.311) Elapsed 102m 33s (remain 352m 42s) Loss: 0.0072(0.0661) Grad: 0.1690  
Epoch: [1][11560/51233] Data 0.312 (0.311) Elapsed 102m 43s (remain 352m 31s) Loss: 0.0058(0.0661) Grad: 0.0713  
Epoch: [1][11580/51233] Data 0.310 (0.311) Elapsed 102m 54s (remain 352m 21s) Loss: 0.0063(0.0661) Grad: 0.0769  
Epoch: [1][11600/51233] Data 0.308 (0.311) Elapsed 103m 5s (remain 352m 10s) Loss: 0.0060(0.0661) Grad: 0.0875  
Epoch: [1][11620/51233] Data 0.317 (0.311) Elapsed 103m 15s (remain 351m 59s) Loss: 0.0821(0.0661) Grad: 1.2340  
Epoch: [1][11640/51233] Data 0.316 (0.311) Elapsed 103m 26s (remain 351m 49s) Loss: 0.1050(0.0661) Grad: 1.3353  
Epoch: [1][11660/51233] Data 0.308 (0.311) Elapsed 103m 37s (remain 351m 38s) Loss: 0.0064(0.0660) Grad: 0.0955  
Epoch: [1][11680/51233] Data 0.314 (0.311) Elapsed 103m 47s (remain 351m 27s) Loss: 0.0015(0.0660) Grad: 0.0255  
Epoch: [1][11700/51233] Data 0.309 (0.311) Elapsed 103m 58s (remain 351m 17s) Loss: 0.0116(0.0660) Grad: 0.2113  
Epoch: [1][11720/51233] Data 0.309 (0.311) Elapsed 104m 9s (remain 351m 6s) Loss: 0.1386(0.0659) Grad: 1.4592  
Epoch: [1][11740/51233] Data 0.308 (0.311) Elapsed 104m 19s (remain 350m 55s) Loss: 0.0240(0.0659) Grad: 0.5503  
Epoch: [1][11760/51233] Data 0.309 (0.311) Elapsed 104m 30s (remain 350m 44s) Loss: 0.0406(0.0659) Grad: 0.8913  
Epoch: [1][11780/51233] Data 0.298 (0.311) Elapsed 104m 41s (remain 350m 34s) Loss: 0.0809(0.0659) Grad: 0.7625  
Epoch: [1][11800/51233] Data 0.302 (0.311) Elapsed 104m 51s (remain 350m 23s) Loss: 0.0929(0.0659) Grad: 0.9948  
Epoch: [1][11820/51233] Data 0.318 (0.311) Elapsed 105m 2s (remain 350m 12s) Loss: 0.1483(0.0660) Grad: 1.4541  
Epoch: [1][11840/51233] Data 0.316 (0.311) Elapsed 105m 13s (remain 350m 2s) Loss: 0.0014(0.0659) Grad: 0.0190  
Epoch: [1][11860/51233] Data 0.317 (0.311) Elapsed 105m 23s (remain 349m 51s) Loss: 0.1498(0.0659) Grad: 2.6290  
Epoch: [1][11880/51233] Data 0.310 (0.311) Elapsed 105m 34s (remain 349m 40s) Loss: 0.0039(0.0659) Grad: 0.0691  
Epoch: [1][11900/51233] Data 0.317 (0.311) Elapsed 105m 45s (remain 349m 30s) Loss: 0.0018(0.0659) Grad: 0.0461  
Epoch: [1][11920/51233] Data 0.306 (0.311) Elapsed 105m 55s (remain 349m 19s) Loss: 0.0018(0.0658) Grad: 0.0248  
Epoch: [1][11940/51233] Data 0.318 (0.311) Elapsed 106m 6s (remain 349m 8s) Loss: 0.0664(0.0658) Grad: 1.1070  
Epoch: [1][11960/51233] Data 0.309 (0.311) Elapsed 106m 17s (remain 348m 58s) Loss: 0.0725(0.0659) Grad: 1.4006  
Epoch: [1][11980/51233] Data 0.317 (0.311) Elapsed 106m 27s (remain 348m 47s) Loss: 0.2591(0.0659) Grad: 1.5522  
Epoch: [1][12000/51233] Data 0.307 (0.311) Elapsed 106m 38s (remain 348m 36s) Loss: 0.1049(0.0659) Grad: 1.0224  
Epoch: [1][12020/51233] Data 0.317 (0.311) Elapsed 106m 49s (remain 348m 26s) Loss: 0.0134(0.0659) Grad: 0.2114  
Epoch: [1][12040/51233] Data 0.314 (0.311) Elapsed 106m 59s (remain 348m 15s) Loss: 0.2074(0.0659) Grad: 1.2179  
Epoch: [1][12060/51233] Data 0.318 (0.311) Elapsed 107m 10s (remain 348m 4s) Loss: 0.0573(0.0659) Grad: 0.7789  
Epoch: [1][12080/51233] Data 0.310 (0.311) Elapsed 107m 21s (remain 347m 53s) Loss: 0.0189(0.0659) Grad: 0.4823  
Epoch: [1][12100/51233] Data 0.297 (0.311) Elapsed 107m 31s (remain 347m 43s) Loss: 0.0043(0.0659) Grad: 0.0896  
Epoch: [1][12120/51233] Data 0.318 (0.311) Elapsed 107m 42s (remain 347m 32s) Loss: 0.0244(0.0659) Grad: 0.4731  
Epoch: [1][12140/51233] Data 0.316 (0.311) Elapsed 107m 52s (remain 347m 21s) Loss: 0.0851(0.0659) Grad: 1.2862  
Epoch: [1][12160/51233] Data 0.318 (0.311) Elapsed 108m 3s (remain 347m 11s) Loss: 0.0234(0.0658) Grad: 0.4114  
Epoch: [1][12180/51233] Data 0.313 (0.311) Elapsed 108m 14s (remain 347m 0s) Loss: 0.0627(0.0658) Grad: 1.7361  
Epoch: [1][12200/51233] Data 0.310 (0.311) Elapsed 108m 24s (remain 346m 49s) Loss: 0.2481(0.0658) Grad: 1.5754  
Epoch: [1][12220/51233] Data 0.318 (0.311) Elapsed 108m 35s (remain 346m 39s) Loss: 0.0033(0.0659) Grad: 0.0475  
Epoch: [1][12240/51233] Data 0.309 (0.311) Elapsed 108m 46s (remain 346m 28s) Loss: 0.0626(0.0659) Grad: 1.1367  
Epoch: [1][12260/51233] Data 0.311 (0.311) Elapsed 108m 56s (remain 346m 17s) Loss: 0.1665(0.0659) Grad: 1.1321  
Epoch: [1][12280/51233] Data 0.315 (0.311) Elapsed 109m 7s (remain 346m 7s) Loss: 0.1964(0.0659) Grad: 0.9770  
Epoch: [1][12300/51233] Data 0.308 (0.311) Elapsed 109m 18s (remain 345m 56s) Loss: 0.0309(0.0658) Grad: 0.6284  
Epoch: [1][12320/51233] Data 0.316 (0.311) Elapsed 109m 28s (remain 345m 45s) Loss: 0.0097(0.0658) Grad: 0.0827  
Epoch: [1][12340/51233] Data 0.309 (0.311) Elapsed 109m 39s (remain 345m 35s) Loss: 0.0416(0.0658) Grad: 0.8940  
Epoch: [1][12360/51233] Data 0.316 (0.311) Elapsed 109m 50s (remain 345m 24s) Loss: 0.0698(0.0658) Grad: 0.7344  
Epoch: [1][12380/51233] Data 0.307 (0.311) Elapsed 110m 0s (remain 345m 13s) Loss: 0.1563(0.0659) Grad: 2.2866  
Epoch: [1][12400/51233] Data 0.312 (0.311) Elapsed 110m 11s (remain 345m 3s) Loss: 0.0223(0.0659) Grad: 0.3848  
Epoch: [1][12420/51233] Data 0.319 (0.311) Elapsed 110m 22s (remain 344m 52s) Loss: 0.0384(0.0659) Grad: 0.4523  
Epoch: [1][12440/51233] Data 0.309 (0.311) Elapsed 110m 32s (remain 344m 41s) Loss: 0.1130(0.0659) Grad: 0.8769  
Epoch: [1][12460/51233] Data 0.310 (0.311) Elapsed 110m 43s (remain 344m 31s) Loss: 0.0080(0.0658) Grad: 0.1819  
Epoch: [1][12480/51233] Data 0.311 (0.311) Elapsed 110m 54s (remain 344m 20s) Loss: 0.0700(0.0658) Grad: 1.2453  
Epoch: [1][12500/51233] Data 0.308 (0.311) Elapsed 111m 4s (remain 344m 9s) Loss: 0.0835(0.0658) Grad: 1.0217  
Epoch: [1][12520/51233] Data 0.318 (0.311) Elapsed 111m 15s (remain 343m 59s) Loss: 0.0668(0.0658) Grad: 0.8810  
Epoch: [1][12540/51233] Data 0.313 (0.311) Elapsed 111m 26s (remain 343m 49s) Loss: 0.2541(0.0658) Grad: 1.6563  
Epoch: [1][12560/51233] Data 0.314 (0.311) Elapsed 111m 36s (remain 343m 37s) Loss: 0.0023(0.0658) Grad: 0.0503  
Epoch: [1][12580/51233] Data 0.319 (0.311) Elapsed 111m 47s (remain 343m 27s) Loss: 0.0546(0.0657) Grad: 0.8086  
Epoch: [1][12600/51233] Data 0.319 (0.311) Elapsed 111m 58s (remain 343m 16s) Loss: 0.0251(0.0657) Grad: 0.5522  
Epoch: [1][12620/51233] Data 0.297 (0.311) Elapsed 112m 8s (remain 343m 5s) Loss: 0.0034(0.0657) Grad: 0.0727  
Epoch: [1][12640/51233] Data 0.314 (0.311) Elapsed 112m 19s (remain 342m 55s) Loss: 0.0182(0.0657) Grad: 0.4202  
Epoch: [1][12660/51233] Data 0.302 (0.311) Elapsed 112m 30s (remain 342m 44s) Loss: 0.0056(0.0657) Grad: 0.1217  
Epoch: [1][12680/51233] Data 0.309 (0.311) Elapsed 112m 40s (remain 342m 33s) Loss: 0.0003(0.0657) Grad: 0.0040  
Epoch: [1][12700/51233] Data 0.311 (0.311) Elapsed 112m 51s (remain 342m 23s) Loss: 0.1490(0.0656) Grad: 1.5027  
Epoch: [1][12720/51233] Data 0.310 (0.311) Elapsed 113m 2s (remain 342m 12s) Loss: 0.4200(0.0657) Grad: 3.1430  
Epoch: [1][12740/51233] Data 0.319 (0.311) Elapsed 113m 12s (remain 342m 1s) Loss: 0.0557(0.0656) Grad: 1.1760  
Epoch: [1][12760/51233] Data 0.309 (0.311) Elapsed 113m 23s (remain 341m 51s) Loss: 0.0048(0.0656) Grad: 0.0614  
Epoch: [1][12780/51233] Data 0.308 (0.311) Elapsed 113m 34s (remain 341m 40s) Loss: 0.0028(0.0656) Grad: 0.0386  
Epoch: [1][12800/51233] Data 0.301 (0.311) Elapsed 113m 44s (remain 341m 29s) Loss: 0.0357(0.0656) Grad: 0.6366  
Epoch: [1][12820/51233] Data 0.310 (0.311) Elapsed 113m 55s (remain 341m 19s) Loss: 0.0570(0.0656) Grad: 0.8024  
Epoch: [1][12840/51233] Data 0.307 (0.311) Elapsed 114m 6s (remain 341m 8s) Loss: 0.0039(0.0656) Grad: 0.0766  
Epoch: [1][12860/51233] Data 0.311 (0.311) Elapsed 114m 16s (remain 340m 57s) Loss: 0.1524(0.0655) Grad: 1.2298  
Epoch: [1][12880/51233] Data 0.318 (0.311) Elapsed 114m 27s (remain 340m 47s) Loss: 0.0108(0.0656) Grad: 0.1668  
Epoch: [1][12900/51233] Data 0.303 (0.311) Elapsed 114m 38s (remain 340m 36s) Loss: 0.0286(0.0656) Grad: 0.4789  
Epoch: [1][12920/51233] Data 0.317 (0.311) Elapsed 114m 48s (remain 340m 25s) Loss: 0.1061(0.0656) Grad: 1.3500  
Epoch: [1][12940/51233] Data 0.306 (0.311) Elapsed 114m 59s (remain 340m 14s) Loss: 0.0270(0.0656) Grad: 0.4583  
Epoch: [1][12960/51233] Data 0.307 (0.311) Elapsed 115m 10s (remain 340m 4s) Loss: 0.0379(0.0656) Grad: 0.6633  
Epoch: [1][12980/51233] Data 0.316 (0.311) Elapsed 115m 20s (remain 339m 53s) Loss: 0.0064(0.0656) Grad: 0.1097  
Epoch: [1][13000/51233] Data 0.308 (0.311) Elapsed 115m 31s (remain 339m 43s) Loss: 0.0515(0.0656) Grad: 0.9398  
Epoch: [1][13020/51233] Data 0.312 (0.311) Elapsed 115m 42s (remain 339m 32s) Loss: 0.0278(0.0656) Grad: 0.4187  
Epoch: [1][13040/51233] Data 0.309 (0.311) Elapsed 115m 52s (remain 339m 21s) Loss: 0.0115(0.0656) Grad: 0.1663  
Epoch: [1][13060/51233] Data 0.311 (0.311) Elapsed 116m 3s (remain 339m 10s) Loss: 0.0065(0.0655) Grad: 0.0862  
Epoch: [1][13080/51233] Data 0.309 (0.311) Elapsed 116m 13s (remain 339m 0s) Loss: 0.5170(0.0656) Grad: 2.5288  
Epoch: [1][13100/51233] Data 0.309 (0.311) Elapsed 116m 24s (remain 338m 49s) Loss: 0.0644(0.0656) Grad: 0.9233  
Epoch: [1][13120/51233] Data 0.318 (0.311) Elapsed 116m 35s (remain 338m 38s) Loss: 0.0293(0.0656) Grad: 1.0055  
Epoch: [1][13140/51233] Data 0.303 (0.311) Elapsed 116m 45s (remain 338m 28s) Loss: 0.0813(0.0656) Grad: 1.5801  
Epoch: [1][13160/51233] Data 0.317 (0.311) Elapsed 116m 56s (remain 338m 17s) Loss: 0.0302(0.0656) Grad: 0.4189  
Epoch: [1][13180/51233] Data 0.302 (0.311) Elapsed 117m 7s (remain 338m 6s) Loss: 0.2435(0.0656) Grad: 2.0438  
Epoch: [1][13200/51233] Data 0.308 (0.311) Elapsed 117m 17s (remain 337m 56s) Loss: 0.1626(0.0656) Grad: 1.8987  
Epoch: [1][13220/51233] Data 0.311 (0.311) Elapsed 117m 28s (remain 337m 45s) Loss: 0.0505(0.0656) Grad: 0.9230  
Epoch: [1][13240/51233] Data 0.300 (0.311) Elapsed 117m 39s (remain 337m 34s) Loss: 0.2199(0.0656) Grad: 2.2401  
Epoch: [1][13260/51233] Data 0.312 (0.311) Elapsed 117m 49s (remain 337m 24s) Loss: 0.0007(0.0656) Grad: 0.0137  
Epoch: [1][13280/51233] Data 0.310 (0.311) Elapsed 118m 0s (remain 337m 13s) Loss: 0.4149(0.0656) Grad: 2.3570  
Epoch: [1][13300/51233] Data 0.316 (0.311) Elapsed 118m 11s (remain 337m 2s) Loss: 0.0571(0.0656) Grad: 0.9490  
Epoch: [1][13320/51233] Data 0.291 (0.311) Elapsed 118m 21s (remain 336m 52s) Loss: 0.0390(0.0656) Grad: 0.6591  
Epoch: [1][13340/51233] Data 0.311 (0.311) Elapsed 118m 32s (remain 336m 41s) Loss: 0.0042(0.0655) Grad: 0.1005  
Epoch: [1][13360/51233] Data 0.318 (0.311) Elapsed 118m 43s (remain 336m 30s) Loss: 0.0024(0.0655) Grad: 0.0390  
Epoch: [1][13380/51233] Data 0.308 (0.311) Elapsed 118m 53s (remain 336m 20s) Loss: 0.1444(0.0654) Grad: 1.6739  
Epoch: [1][13400/51233] Data 0.302 (0.311) Elapsed 119m 4s (remain 336m 9s) Loss: 0.0774(0.0655) Grad: 1.4776  
Epoch: [1][13420/51233] Data 0.309 (0.311) Elapsed 119m 15s (remain 335m 58s) Loss: 0.0139(0.0654) Grad: 0.2271  
Epoch: [1][13440/51233] Data 0.308 (0.311) Elapsed 119m 25s (remain 335m 48s) Loss: 0.0356(0.0654) Grad: 0.4230  
Epoch: [1][13460/51233] Data 0.305 (0.311) Elapsed 119m 36s (remain 335m 37s) Loss: 0.0023(0.0653) Grad: 0.0525  
Epoch: [1][13480/51233] Data 0.308 (0.311) Elapsed 119m 47s (remain 335m 26s) Loss: 0.0410(0.0653) Grad: 0.9897  
Epoch: [1][13500/51233] Data 0.309 (0.311) Elapsed 119m 57s (remain 335m 16s) Loss: 0.1833(0.0653) Grad: 1.0601  
Epoch: [1][13520/51233] Data 0.318 (0.311) Elapsed 120m 8s (remain 335m 5s) Loss: 0.0607(0.0653) Grad: 0.7986  
Epoch: [1][13540/51233] Data 0.309 (0.311) Elapsed 120m 19s (remain 334m 54s) Loss: 0.0612(0.0653) Grad: 1.1859  
Epoch: [1][13560/51233] Data 0.310 (0.311) Elapsed 120m 29s (remain 334m 44s) Loss: 0.0082(0.0653) Grad: 0.1979  
Epoch: [1][13580/51233] Data 0.316 (0.311) Elapsed 120m 40s (remain 334m 33s) Loss: 0.0091(0.0653) Grad: 0.1716  
Epoch: [1][13600/51233] Data 0.314 (0.311) Elapsed 120m 51s (remain 334m 22s) Loss: 0.0115(0.0652) Grad: 0.1888  
Epoch: [1][13620/51233] Data 0.305 (0.311) Elapsed 121m 1s (remain 334m 12s) Loss: 0.0401(0.0652) Grad: 0.7167  
Epoch: [1][13640/51233] Data 0.315 (0.311) Elapsed 121m 12s (remain 334m 1s) Loss: 0.5896(0.0653) Grad: 3.7380  
Epoch: [1][13660/51233] Data 0.309 (0.311) Elapsed 121m 23s (remain 333m 50s) Loss: 0.1318(0.0653) Grad: 1.6950  
Epoch: [1][13680/51233] Data 0.318 (0.311) Elapsed 121m 33s (remain 333m 40s) Loss: 0.1202(0.0653) Grad: 1.2390  
Epoch: [1][13700/51233] Data 0.318 (0.311) Elapsed 121m 44s (remain 333m 29s) Loss: 0.0263(0.0653) Grad: 0.4724  
Epoch: [1][13720/51233] Data 0.309 (0.311) Elapsed 121m 55s (remain 333m 18s) Loss: 0.0237(0.0652) Grad: 0.5714  
Epoch: [1][13740/51233] Data 0.307 (0.311) Elapsed 122m 5s (remain 333m 8s) Loss: 0.0031(0.0652) Grad: 0.0641  
Epoch: [1][13760/51233] Data 0.309 (0.311) Elapsed 122m 16s (remain 332m 57s) Loss: 0.3830(0.0652) Grad: 2.4993  
Epoch: [1][13780/51233] Data 0.318 (0.311) Elapsed 122m 27s (remain 332m 46s) Loss: 0.0413(0.0652) Grad: 0.9968  
Epoch: [1][13800/51233] Data 0.309 (0.311) Elapsed 122m 37s (remain 332m 36s) Loss: 0.0247(0.0652) Grad: 0.4473  
Epoch: [1][13820/51233] Data 0.318 (0.311) Elapsed 122m 48s (remain 332m 25s) Loss: 0.0351(0.0652) Grad: 0.3541  
Epoch: [1][13840/51233] Data 0.307 (0.311) Elapsed 122m 59s (remain 332m 14s) Loss: 0.1933(0.0652) Grad: 1.4513  
Epoch: [1][13860/51233] Data 0.297 (0.311) Elapsed 123m 9s (remain 332m 4s) Loss: 0.0267(0.0652) Grad: 0.2611  
Epoch: [1][13880/51233] Data 0.315 (0.311) Elapsed 123m 20s (remain 331m 53s) Loss: 0.0145(0.0652) Grad: 0.1960  
Epoch: [1][13900/51233] Data 0.302 (0.311) Elapsed 123m 30s (remain 331m 42s) Loss: 0.0140(0.0652) Grad: 0.1484  
Epoch: [1][13920/51233] Data 0.313 (0.311) Elapsed 123m 41s (remain 331m 32s) Loss: 0.0048(0.0653) Grad: 0.1139  
Epoch: [1][13940/51233] Data 0.306 (0.311) Elapsed 123m 52s (remain 331m 21s) Loss: 0.0160(0.0653) Grad: 0.3496  
Epoch: [1][13960/51233] Data 0.304 (0.311) Elapsed 124m 2s (remain 331m 10s) Loss: 0.0162(0.0653) Grad: 0.2653  
Epoch: [1][13980/51233] Data 0.316 (0.311) Elapsed 124m 13s (remain 331m 0s) Loss: 0.1316(0.0652) Grad: 1.9459  
Epoch: [1][14000/51233] Data 0.309 (0.311) Elapsed 124m 24s (remain 330m 49s) Loss: 0.0078(0.0652) Grad: 0.1164  
Epoch: [1][14020/51233] Data 0.306 (0.311) Elapsed 124m 34s (remain 330m 38s) Loss: 0.0324(0.0652) Grad: 0.5593  
Epoch: [1][14040/51233] Data 0.308 (0.311) Elapsed 124m 45s (remain 330m 28s) Loss: 0.0065(0.0652) Grad: 0.1208  
Epoch: [1][14060/51233] Data 0.310 (0.311) Elapsed 124m 56s (remain 330m 17s) Loss: 0.0456(0.0652) Grad: 0.7004  
Epoch: [1][14080/51233] Data 0.308 (0.311) Elapsed 125m 6s (remain 330m 6s) Loss: 0.1896(0.0653) Grad: 1.4492  
Epoch: [1][14100/51233] Data 0.319 (0.311) Elapsed 125m 17s (remain 329m 56s) Loss: 0.0199(0.0653) Grad: 0.2595  
Epoch: [1][14120/51233] Data 0.317 (0.311) Elapsed 125m 28s (remain 329m 45s) Loss: 0.1471(0.0653) Grad: 1.6733  
Epoch: [1][14140/51233] Data 0.299 (0.311) Elapsed 125m 38s (remain 329m 34s) Loss: 0.0212(0.0653) Grad: 0.4003  
Epoch: [1][14160/51233] Data 0.309 (0.311) Elapsed 125m 49s (remain 329m 23s) Loss: 0.0236(0.0653) Grad: 0.4668  
Epoch: [1][14180/51233] Data 0.308 (0.311) Elapsed 126m 0s (remain 329m 13s) Loss: 0.0318(0.0652) Grad: 0.4070  
Epoch: [1][14200/51233] Data 0.304 (0.311) Elapsed 126m 10s (remain 329m 2s) Loss: 0.0046(0.0652) Grad: 0.0742  
Epoch: [1][14220/51233] Data 0.307 (0.311) Elapsed 126m 21s (remain 328m 51s) Loss: 0.0395(0.0653) Grad: 0.7238  
Epoch: [1][14240/51233] Data 0.306 (0.311) Elapsed 126m 32s (remain 328m 41s) Loss: 0.0080(0.0653) Grad: 0.1841  
Epoch: [1][14260/51233] Data 0.317 (0.311) Elapsed 126m 42s (remain 328m 30s) Loss: 0.0377(0.0653) Grad: 0.4379  
Epoch: [1][14280/51233] Data 0.310 (0.311) Elapsed 126m 53s (remain 328m 19s) Loss: 0.5624(0.0653) Grad: 2.2407  
Epoch: [1][14300/51233] Data 0.318 (0.311) Elapsed 127m 4s (remain 328m 9s) Loss: 0.2357(0.0653) Grad: 2.1302  
Epoch: [1][14320/51233] Data 0.310 (0.311) Elapsed 127m 14s (remain 327m 58s) Loss: 0.3523(0.0653) Grad: 1.9757  
Epoch: [1][14340/51233] Data 0.318 (0.311) Elapsed 127m 25s (remain 327m 47s) Loss: 0.0112(0.0653) Grad: 0.3087  
Epoch: [1][14360/51233] Data 0.310 (0.311) Elapsed 127m 36s (remain 327m 37s) Loss: 0.1860(0.0653) Grad: 1.0803  
Epoch: [1][14380/51233] Data 0.316 (0.311) Elapsed 127m 46s (remain 327m 26s) Loss: 0.0433(0.0654) Grad: 0.5014  
Epoch: [1][14400/51233] Data 0.318 (0.311) Elapsed 127m 57s (remain 327m 15s) Loss: 0.0261(0.0654) Grad: 0.2581  
Epoch: [1][14420/51233] Data 0.308 (0.311) Elapsed 128m 8s (remain 327m 5s) Loss: 0.0503(0.0654) Grad: 0.6617  
Epoch: [1][14440/51233] Data 0.317 (0.311) Elapsed 128m 18s (remain 326m 54s) Loss: 0.0069(0.0654) Grad: 0.1866  
Epoch: [1][14460/51233] Data 0.309 (0.311) Elapsed 128m 29s (remain 326m 43s) Loss: 0.1377(0.0654) Grad: 1.7070  
Epoch: [1][14480/51233] Data 0.308 (0.311) Elapsed 128m 40s (remain 326m 33s) Loss: 0.0080(0.0654) Grad: 0.1277  
Epoch: [1][14500/51233] Data 0.317 (0.311) Elapsed 128m 50s (remain 326m 22s) Loss: 0.0348(0.0654) Grad: 0.7902  
Epoch: [1][14520/51233] Data 0.309 (0.311) Elapsed 129m 1s (remain 326m 11s) Loss: 0.0083(0.0653) Grad: 0.0847  
Epoch: [1][14540/51233] Data 0.319 (0.311) Elapsed 129m 12s (remain 326m 1s) Loss: 0.4757(0.0653) Grad: 1.3034  
Epoch: [1][14560/51233] Data 0.307 (0.311) Elapsed 129m 22s (remain 325m 50s) Loss: 0.0451(0.0653) Grad: 0.6344  
Epoch: [1][14580/51233] Data 0.308 (0.311) Elapsed 129m 33s (remain 325m 39s) Loss: 0.0410(0.0653) Grad: 0.7351  
Epoch: [1][14600/51233] Data 0.298 (0.311) Elapsed 129m 44s (remain 325m 29s) Loss: 0.0155(0.0653) Grad: 0.3006  
Epoch: [1][14620/51233] Data 0.302 (0.311) Elapsed 129m 54s (remain 325m 18s) Loss: 0.4137(0.0653) Grad: 2.0466  
Epoch: [1][14640/51233] Data 0.302 (0.311) Elapsed 130m 5s (remain 325m 7s) Loss: 0.0189(0.0653) Grad: 0.2646  
Epoch: [1][14660/51233] Data 0.313 (0.311) Elapsed 130m 16s (remain 324m 57s) Loss: 0.0116(0.0653) Grad: 0.1469  
Epoch: [1][14680/51233] Data 0.308 (0.311) Elapsed 130m 26s (remain 324m 46s) Loss: 0.0152(0.0654) Grad: 0.3008  
Epoch: [1][14700/51233] Data 0.311 (0.311) Elapsed 130m 37s (remain 324m 35s) Loss: 0.0220(0.0653) Grad: 0.2860  
Epoch: [1][14720/51233] Data 0.318 (0.311) Elapsed 130m 47s (remain 324m 25s) Loss: 0.0607(0.0653) Grad: 0.8280  
Epoch: [1][14740/51233] Data 0.305 (0.311) Elapsed 130m 58s (remain 324m 14s) Loss: 0.0040(0.0653) Grad: 0.0920  
Epoch: [1][14760/51233] Data 0.317 (0.311) Elapsed 131m 9s (remain 324m 3s) Loss: 0.0237(0.0653) Grad: 0.5389  
Epoch: [1][14780/51233] Data 0.318 (0.311) Elapsed 131m 19s (remain 323m 53s) Loss: 0.2909(0.0653) Grad: 1.7249  
Epoch: [1][14800/51233] Data 0.308 (0.311) Elapsed 131m 30s (remain 323m 42s) Loss: 0.1239(0.0653) Grad: 1.5082  
Epoch: [1][14820/51233] Data 0.309 (0.311) Elapsed 131m 41s (remain 323m 31s) Loss: 0.0019(0.0653) Grad: 0.0414  
Epoch: [1][14840/51233] Data 0.309 (0.311) Elapsed 131m 51s (remain 323m 21s) Loss: 0.3068(0.0653) Grad: 3.7372  
Epoch: [1][14860/51233] Data 0.301 (0.311) Elapsed 132m 2s (remain 323m 10s) Loss: 0.0061(0.0652) Grad: 0.1026  
Epoch: [1][14880/51233] Data 0.296 (0.311) Elapsed 132m 13s (remain 322m 59s) Loss: 0.0627(0.0652) Grad: 0.9642  
Epoch: [1][14900/51233] Data 0.303 (0.311) Elapsed 132m 23s (remain 322m 49s) Loss: 0.0097(0.0652) Grad: 0.0931  
Epoch: [1][14920/51233] Data 0.310 (0.311) Elapsed 132m 34s (remain 322m 38s) Loss: 0.2346(0.0652) Grad: 1.3889  
Epoch: [1][14940/51233] Data 0.318 (0.311) Elapsed 132m 45s (remain 322m 27s) Loss: 0.0215(0.0652) Grad: 0.4594  
Epoch: [1][14960/51233] Data 0.315 (0.311) Elapsed 132m 55s (remain 322m 17s) Loss: 0.0735(0.0652) Grad: 0.9967  
Epoch: [1][14980/51233] Data 0.302 (0.311) Elapsed 133m 6s (remain 322m 6s) Loss: 0.0236(0.0651) Grad: 0.6818  
Epoch: [1][15000/51233] Data 0.309 (0.311) Elapsed 133m 17s (remain 321m 55s) Loss: 0.0021(0.0651) Grad: 0.0220  
Epoch: [1][15020/51233] Data 0.317 (0.311) Elapsed 133m 27s (remain 321m 45s) Loss: 0.1422(0.0651) Grad: 1.8768  
Epoch: [1][15040/51233] Data 0.312 (0.311) Elapsed 133m 38s (remain 321m 34s) Loss: 0.0154(0.0651) Grad: 0.2157  
Epoch: [1][15060/51233] Data 0.316 (0.311) Elapsed 133m 49s (remain 321m 23s) Loss: 0.1112(0.0651) Grad: 1.4019  
Epoch: [1][15080/51233] Data 0.309 (0.311) Elapsed 133m 59s (remain 321m 13s) Loss: 0.0388(0.0651) Grad: 0.7347  
Epoch: [1][15100/51233] Data 0.317 (0.311) Elapsed 134m 10s (remain 321m 2s) Loss: 0.1002(0.0650) Grad: 1.9901  
Epoch: [1][15120/51233] Data 0.308 (0.311) Elapsed 134m 21s (remain 320m 51s) Loss: 0.0687(0.0650) Grad: 0.7216  
Epoch: [1][15140/51233] Data 0.296 (0.311) Elapsed 134m 31s (remain 320m 41s) Loss: 0.1946(0.0650) Grad: 2.6212  
Epoch: [1][15160/51233] Data 0.304 (0.311) Elapsed 134m 42s (remain 320m 30s) Loss: 0.0493(0.0650) Grad: 0.9598  
Epoch: [1][15180/51233] Data 0.290 (0.311) Elapsed 134m 53s (remain 320m 19s) Loss: 0.0015(0.0650) Grad: 0.0186  
Epoch: [1][15200/51233] Data 0.318 (0.311) Elapsed 135m 3s (remain 320m 9s) Loss: 0.0043(0.0649) Grad: 0.1178  
Epoch: [1][15220/51233] Data 0.304 (0.311) Elapsed 135m 14s (remain 319m 58s) Loss: 0.0111(0.0649) Grad: 0.2203  
Epoch: [1][15240/51233] Data 0.315 (0.311) Elapsed 135m 25s (remain 319m 47s) Loss: 0.0013(0.0649) Grad: 0.0190  
Epoch: [1][15260/51233] Data 0.317 (0.311) Elapsed 135m 35s (remain 319m 37s) Loss: 0.0169(0.0649) Grad: 0.5813  
Epoch: [1][15280/51233] Data 0.313 (0.311) Elapsed 135m 46s (remain 319m 26s) Loss: 0.0040(0.0649) Grad: 0.0531  
Epoch: [1][15300/51233] Data 0.317 (0.311) Elapsed 135m 57s (remain 319m 15s) Loss: 0.0070(0.0649) Grad: 0.1377  
Epoch: [1][15320/51233] Data 0.309 (0.311) Elapsed 136m 7s (remain 319m 5s) Loss: 0.0382(0.0648) Grad: 0.8381  
Epoch: [1][15340/51233] Data 0.309 (0.311) Elapsed 136m 18s (remain 318m 54s) Loss: 0.0004(0.0648) Grad: 0.0073  
Epoch: [1][15360/51233] Data 0.311 (0.311) Elapsed 136m 29s (remain 318m 43s) Loss: 0.0021(0.0648) Grad: 0.0291  
Epoch: [1][15380/51233] Data 0.318 (0.311) Elapsed 136m 39s (remain 318m 32s) Loss: 0.0073(0.0648) Grad: 0.1213  
Epoch: [1][15400/51233] Data 0.313 (0.311) Elapsed 136m 50s (remain 318m 22s) Loss: 0.0428(0.0648) Grad: 1.1139  
Epoch: [1][15420/51233] Data 0.307 (0.311) Elapsed 137m 1s (remain 318m 11s) Loss: 0.0151(0.0648) Grad: 0.2375  
Epoch: [1][15440/51233] Data 0.305 (0.311) Elapsed 137m 11s (remain 318m 0s) Loss: 0.2796(0.0648) Grad: 3.9331  
Epoch: [1][15460/51233] Data 0.317 (0.311) Elapsed 137m 22s (remain 317m 50s) Loss: 0.0347(0.0648) Grad: 0.6764  
Epoch: [1][15480/51233] Data 0.317 (0.311) Elapsed 137m 33s (remain 317m 39s) Loss: 0.0058(0.0648) Grad: 0.1609  
Epoch: [1][15500/51233] Data 0.318 (0.311) Elapsed 137m 43s (remain 317m 28s) Loss: 0.0010(0.0648) Grad: 0.0140  
Epoch: [1][15520/51233] Data 0.317 (0.311) Elapsed 137m 54s (remain 317m 18s) Loss: 0.0037(0.0648) Grad: 0.0595  
Epoch: [1][15540/51233] Data 0.309 (0.311) Elapsed 138m 4s (remain 317m 7s) Loss: 0.1122(0.0649) Grad: 1.2433  
Epoch: [1][15560/51233] Data 0.317 (0.311) Elapsed 138m 15s (remain 316m 56s) Loss: 0.0522(0.0649) Grad: 0.7992  
Epoch: [1][15580/51233] Data 0.309 (0.311) Elapsed 138m 26s (remain 316m 46s) Loss: 0.0808(0.0649) Grad: 1.3325  
Epoch: [1][15600/51233] Data 0.318 (0.311) Elapsed 138m 36s (remain 316m 35s) Loss: 0.3066(0.0649) Grad: 2.8542  
Epoch: [1][15620/51233] Data 0.308 (0.311) Elapsed 138m 47s (remain 316m 24s) Loss: 0.0375(0.0649) Grad: 0.5860  
Epoch: [1][15640/51233] Data 0.313 (0.311) Elapsed 138m 58s (remain 316m 14s) Loss: 0.0819(0.0649) Grad: 1.2173  
Epoch: [1][15660/51233] Data 0.310 (0.311) Elapsed 139m 8s (remain 316m 3s) Loss: 0.0024(0.0649) Grad: 0.0329  
Epoch: [1][15680/51233] Data 0.309 (0.311) Elapsed 139m 19s (remain 315m 52s) Loss: 0.0150(0.0649) Grad: 0.1781  
Epoch: [1][15700/51233] Data 0.309 (0.311) Elapsed 139m 30s (remain 315m 42s) Loss: 0.1681(0.0649) Grad: 2.0697  
Epoch: [1][15720/51233] Data 0.311 (0.311) Elapsed 139m 40s (remain 315m 31s) Loss: 0.0124(0.0650) Grad: 0.1038  
Epoch: [1][15740/51233] Data 0.312 (0.311) Elapsed 139m 51s (remain 315m 20s) Loss: 0.1146(0.0649) Grad: 1.2520  
Epoch: [1][15760/51233] Data 0.306 (0.311) Elapsed 140m 2s (remain 315m 10s) Loss: 0.0114(0.0649) Grad: 0.2229  
Epoch: [1][15780/51233] Data 0.308 (0.311) Elapsed 140m 12s (remain 314m 59s) Loss: 0.0072(0.0649) Grad: 0.0838  
Epoch: [1][15800/51233] Data 0.318 (0.311) Elapsed 140m 23s (remain 314m 48s) Loss: 0.0310(0.0649) Grad: 0.5149  
Epoch: [1][15820/51233] Data 0.312 (0.311) Elapsed 140m 34s (remain 314m 38s) Loss: 0.0074(0.0649) Grad: 0.1242  
Epoch: [1][15840/51233] Data 0.306 (0.311) Elapsed 140m 44s (remain 314m 27s) Loss: 0.0536(0.0649) Grad: 1.1773  
Epoch: [1][15860/51233] Data 0.307 (0.311) Elapsed 140m 55s (remain 314m 16s) Loss: 0.0391(0.0649) Grad: 0.4141  
Epoch: [1][15880/51233] Data 0.315 (0.311) Elapsed 141m 6s (remain 314m 6s) Loss: 0.0056(0.0649) Grad: 0.0473  
Epoch: [1][15900/51233] Data 0.308 (0.311) Elapsed 141m 16s (remain 313m 55s) Loss: 0.2094(0.0649) Grad: 1.6596  
Epoch: [1][15920/51233] Data 0.310 (0.311) Elapsed 141m 27s (remain 313m 44s) Loss: 0.0025(0.0649) Grad: 0.0268  
Epoch: [1][15940/51233] Data 0.318 (0.311) Elapsed 141m 38s (remain 313m 34s) Loss: 0.0773(0.0649) Grad: 1.4320  
Epoch: [1][15960/51233] Data 0.317 (0.311) Elapsed 141m 48s (remain 313m 23s) Loss: 0.1260(0.0649) Grad: 1.5081  
Epoch: [1][15980/51233] Data 0.317 (0.311) Elapsed 141m 59s (remain 313m 12s) Loss: 0.0833(0.0649) Grad: 0.9562  
Epoch: [1][16000/51233] Data 0.311 (0.311) Elapsed 142m 10s (remain 313m 2s) Loss: 0.0094(0.0649) Grad: 0.1090  
Epoch: [1][16020/51233] Data 0.318 (0.311) Elapsed 142m 20s (remain 312m 51s) Loss: 0.0120(0.0649) Grad: 0.1477  
Epoch: [1][16040/51233] Data 0.299 (0.311) Elapsed 142m 31s (remain 312m 40s) Loss: 0.0035(0.0648) Grad: 0.0555  
Epoch: [1][16060/51233] Data 0.308 (0.311) Elapsed 142m 42s (remain 312m 30s) Loss: 0.0082(0.0648) Grad: 0.2207  
Epoch: [1][16080/51233] Data 0.309 (0.311) Elapsed 142m 52s (remain 312m 19s) Loss: 0.0021(0.0648) Grad: 0.0331  
Epoch: [1][16100/51233] Data 0.308 (0.311) Elapsed 143m 3s (remain 312m 8s) Loss: 0.0286(0.0648) Grad: 0.4158  
Epoch: [1][16120/51233] Data 0.309 (0.311) Elapsed 143m 14s (remain 311m 58s) Loss: 0.2219(0.0648) Grad: 1.9550  
Epoch: [1][16140/51233] Data 0.317 (0.311) Elapsed 143m 24s (remain 311m 47s) Loss: 0.0289(0.0648) Grad: 0.3433  
Epoch: [1][16160/51233] Data 0.318 (0.311) Elapsed 143m 35s (remain 311m 36s) Loss: 0.1070(0.0648) Grad: 0.9042  
Epoch: [1][16180/51233] Data 0.309 (0.311) Elapsed 143m 45s (remain 311m 25s) Loss: 0.0809(0.0648) Grad: 1.2604  
Epoch: [1][16200/51233] Data 0.296 (0.311) Elapsed 143m 56s (remain 311m 15s) Loss: 0.0026(0.0648) Grad: 0.0286  
Epoch: [1][16220/51233] Data 0.316 (0.311) Elapsed 144m 7s (remain 311m 4s) Loss: 0.0055(0.0648) Grad: 0.0775  
Epoch: [1][16240/51233] Data 0.317 (0.311) Elapsed 144m 17s (remain 310m 53s) Loss: 0.0201(0.0648) Grad: 0.3987  
Epoch: [1][16260/51233] Data 0.314 (0.311) Elapsed 144m 28s (remain 310m 43s) Loss: 0.1882(0.0648) Grad: 1.5341  
Epoch: [1][16280/51233] Data 0.311 (0.311) Elapsed 144m 39s (remain 310m 32s) Loss: 0.0057(0.0648) Grad: 0.0848  
Epoch: [1][16300/51233] Data 0.315 (0.311) Elapsed 144m 49s (remain 310m 21s) Loss: 0.1320(0.0648) Grad: 1.3808  
Epoch: [1][16320/51233] Data 0.304 (0.311) Elapsed 145m 0s (remain 310m 11s) Loss: 0.0405(0.0648) Grad: 0.5577  
Epoch: [1][16340/51233] Data 0.308 (0.311) Elapsed 145m 11s (remain 310m 0s) Loss: 0.1615(0.0648) Grad: 1.5849  
Epoch: [1][16360/51233] Data 0.318 (0.311) Elapsed 145m 21s (remain 309m 49s) Loss: 0.0525(0.0648) Grad: 0.8731  
Epoch: [1][16380/51233] Data 0.308 (0.311) Elapsed 145m 32s (remain 309m 39s) Loss: 0.0064(0.0647) Grad: 0.0942  
Epoch: [1][16400/51233] Data 0.309 (0.311) Elapsed 145m 43s (remain 309m 28s) Loss: 0.0051(0.0647) Grad: 0.0992  
Epoch: [1][16420/51233] Data 0.308 (0.311) Elapsed 145m 53s (remain 309m 17s) Loss: 0.0019(0.0647) Grad: 0.0236  
Epoch: [1][16440/51233] Data 0.304 (0.311) Elapsed 146m 4s (remain 309m 7s) Loss: 0.0072(0.0647) Grad: 0.1198  
Epoch: [1][16460/51233] Data 0.308 (0.311) Elapsed 146m 15s (remain 308m 56s) Loss: 0.0268(0.0647) Grad: 0.6553  
Epoch: [1][16480/51233] Data 0.308 (0.311) Elapsed 146m 25s (remain 308m 45s) Loss: 0.0077(0.0648) Grad: 0.0693  
Epoch: [1][16500/51233] Data 0.318 (0.311) Elapsed 146m 36s (remain 308m 35s) Loss: 0.0093(0.0648) Grad: 0.0866  
Epoch: [1][16520/51233] Data 0.312 (0.311) Elapsed 146m 47s (remain 308m 24s) Loss: 0.0304(0.0648) Grad: 0.4091  
Epoch: [1][16540/51233] Data 0.316 (0.311) Elapsed 146m 57s (remain 308m 13s) Loss: 0.0646(0.0647) Grad: 1.4290  
Epoch: [1][16560/51233] Data 0.317 (0.311) Elapsed 147m 8s (remain 308m 3s) Loss: 0.0852(0.0647) Grad: 2.4437  
Epoch: [1][16580/51233] Data 0.299 (0.311) Elapsed 147m 19s (remain 307m 52s) Loss: 0.0519(0.0647) Grad: 0.7372  
Epoch: [1][16600/51233] Data 0.317 (0.311) Elapsed 147m 29s (remain 307m 41s) Loss: 0.0618(0.0647) Grad: 1.3912  
Epoch: [1][16620/51233] Data 0.317 (0.311) Elapsed 147m 40s (remain 307m 31s) Loss: 0.0348(0.0647) Grad: 0.4981  
Epoch: [1][16640/51233] Data 0.317 (0.311) Elapsed 147m 51s (remain 307m 20s) Loss: 0.0236(0.0647) Grad: 0.4265  
Epoch: [1][16660/51233] Data 0.309 (0.311) Elapsed 148m 1s (remain 307m 9s) Loss: 0.1144(0.0647) Grad: 1.6309  
Epoch: [1][16680/51233] Data 0.308 (0.311) Elapsed 148m 12s (remain 306m 59s) Loss: 0.0373(0.0647) Grad: 0.8039  
Epoch: [1][16700/51233] Data 0.308 (0.311) Elapsed 148m 23s (remain 306m 48s) Loss: 0.0019(0.0647) Grad: 0.0317  
Epoch: [1][16720/51233] Data 0.313 (0.311) Elapsed 148m 33s (remain 306m 37s) Loss: 0.0292(0.0647) Grad: 0.2898  
Epoch: [1][16740/51233] Data 0.318 (0.311) Elapsed 148m 44s (remain 306m 27s) Loss: 0.0361(0.0647) Grad: 0.3662  
Epoch: [1][16760/51233] Data 0.317 (0.311) Elapsed 148m 55s (remain 306m 16s) Loss: 0.0083(0.0647) Grad: 0.1957  
Epoch: [1][16780/51233] Data 0.308 (0.311) Elapsed 149m 5s (remain 306m 5s) Loss: 0.0300(0.0647) Grad: 0.3497  
Epoch: [1][16800/51233] Data 0.309 (0.311) Elapsed 149m 16s (remain 305m 55s) Loss: 0.0292(0.0647) Grad: 0.7029  
Epoch: [1][16820/51233] Data 0.296 (0.311) Elapsed 149m 27s (remain 305m 44s) Loss: 0.0097(0.0647) Grad: 0.1434  
Epoch: [1][16840/51233] Data 0.300 (0.311) Elapsed 149m 37s (remain 305m 33s) Loss: 0.0016(0.0647) Grad: 0.0167  
Epoch: [1][16860/51233] Data 0.315 (0.311) Elapsed 149m 48s (remain 305m 23s) Loss: 0.0379(0.0647) Grad: 0.5433  
Epoch: [1][16880/51233] Data 0.309 (0.311) Elapsed 149m 59s (remain 305m 12s) Loss: 0.0100(0.0646) Grad: 0.1463  
Epoch: [1][16900/51233] Data 0.317 (0.311) Elapsed 150m 9s (remain 305m 1s) Loss: 0.0032(0.0646) Grad: 0.0290  
Epoch: [1][16920/51233] Data 0.309 (0.311) Elapsed 150m 20s (remain 304m 51s) Loss: 0.0543(0.0647) Grad: 1.0897  
Epoch: [1][16940/51233] Data 0.313 (0.311) Elapsed 150m 30s (remain 304m 40s) Loss: 0.0425(0.0646) Grad: 0.7405  
Epoch: [1][16960/51233] Data 0.308 (0.311) Elapsed 150m 41s (remain 304m 29s) Loss: 0.0101(0.0646) Grad: 0.1251  
Epoch: [1][16980/51233] Data 0.317 (0.311) Elapsed 150m 52s (remain 304m 19s) Loss: 0.0126(0.0647) Grad: 0.1614  
Epoch: [1][17000/51233] Data 0.307 (0.311) Elapsed 151m 2s (remain 304m 8s) Loss: 0.0595(0.0647) Grad: 1.4269  
Epoch: [1][17020/51233] Data 0.317 (0.311) Elapsed 151m 13s (remain 303m 57s) Loss: 0.0796(0.0646) Grad: 1.0200  
Epoch: [1][17040/51233] Data 0.318 (0.311) Elapsed 151m 24s (remain 303m 47s) Loss: 0.0007(0.0646) Grad: 0.0071  
Epoch: [1][17060/51233] Data 0.317 (0.311) Elapsed 151m 34s (remain 303m 36s) Loss: 0.3332(0.0646) Grad: 1.1915  
Epoch: [1][17080/51233] Data 0.305 (0.311) Elapsed 151m 45s (remain 303m 25s) Loss: 0.1111(0.0646) Grad: 1.5564  
Epoch: [1][17100/51233] Data 0.306 (0.311) Elapsed 151m 56s (remain 303m 15s) Loss: 0.0031(0.0646) Grad: 0.0444  
Epoch: [1][17120/51233] Data 0.318 (0.311) Elapsed 152m 6s (remain 303m 4s) Loss: 0.0607(0.0646) Grad: 1.1437  
Epoch: [1][17140/51233] Data 0.304 (0.311) Elapsed 152m 17s (remain 302m 53s) Loss: 0.0066(0.0645) Grad: 0.2637  
Epoch: [1][17160/51233] Data 0.309 (0.311) Elapsed 152m 28s (remain 302m 43s) Loss: 0.0847(0.0645) Grad: 1.5113  
Epoch: [1][17180/51233] Data 0.310 (0.311) Elapsed 152m 38s (remain 302m 32s) Loss: 0.0456(0.0645) Grad: 0.8602  
Epoch: [1][17200/51233] Data 0.310 (0.311) Elapsed 152m 49s (remain 302m 21s) Loss: 0.1189(0.0646) Grad: 1.3356  
Epoch: [1][17220/51233] Data 0.318 (0.311) Elapsed 153m 0s (remain 302m 11s) Loss: 0.0178(0.0646) Grad: 0.3934  
Epoch: [1][17240/51233] Data 0.317 (0.311) Elapsed 153m 10s (remain 302m 0s) Loss: 0.1669(0.0645) Grad: 1.1536  
Epoch: [1][17260/51233] Data 0.305 (0.311) Elapsed 153m 21s (remain 301m 49s) Loss: 0.0011(0.0645) Grad: 0.0160  
Epoch: [1][17280/51233] Data 0.308 (0.311) Elapsed 153m 32s (remain 301m 39s) Loss: 0.0036(0.0645) Grad: 0.0359  
Epoch: [1][17300/51233] Data 0.307 (0.311) Elapsed 153m 42s (remain 301m 28s) Loss: 0.0268(0.0645) Grad: 0.3301  
Epoch: [1][17320/51233] Data 0.310 (0.311) Elapsed 153m 53s (remain 301m 17s) Loss: 0.0860(0.0645) Grad: 1.5225  
Epoch: [1][17340/51233] Data 0.317 (0.311) Elapsed 154m 4s (remain 301m 7s) Loss: 0.0043(0.0645) Grad: 0.0538  
Epoch: [1][17360/51233] Data 0.309 (0.311) Elapsed 154m 14s (remain 300m 56s) Loss: 0.0384(0.0645) Grad: 0.4874  
Epoch: [1][17380/51233] Data 0.311 (0.311) Elapsed 154m 25s (remain 300m 45s) Loss: 0.1961(0.0645) Grad: 1.6679  
Epoch: [1][17400/51233] Data 0.309 (0.311) Elapsed 154m 36s (remain 300m 35s) Loss: 0.0088(0.0645) Grad: 0.1472  
Epoch: [1][17420/51233] Data 0.296 (0.311) Elapsed 154m 46s (remain 300m 24s) Loss: 0.1391(0.0645) Grad: 2.0711  
Epoch: [1][17440/51233] Data 0.308 (0.311) Elapsed 154m 57s (remain 300m 13s) Loss: 0.0220(0.0645) Grad: 0.3590  
Epoch: [1][17460/51233] Data 0.308 (0.311) Elapsed 155m 8s (remain 300m 3s) Loss: 0.1795(0.0644) Grad: 2.7803  
Epoch: [1][17480/51233] Data 0.312 (0.311) Elapsed 155m 18s (remain 299m 52s) Loss: 0.1478(0.0644) Grad: 2.7850  
Epoch: [1][17500/51233] Data 0.302 (0.311) Elapsed 155m 29s (remain 299m 41s) Loss: 0.0050(0.0644) Grad: 0.1304  
Epoch: [1][17520/51233] Data 0.317 (0.311) Elapsed 155m 40s (remain 299m 31s) Loss: 0.0973(0.0644) Grad: 1.4523  
Epoch: [1][17540/51233] Data 0.310 (0.311) Elapsed 155m 50s (remain 299m 20s) Loss: 0.1448(0.0644) Grad: 2.3191  
Epoch: [1][17560/51233] Data 0.312 (0.311) Elapsed 156m 1s (remain 299m 9s) Loss: 0.0528(0.0643) Grad: 0.8345  
Epoch: [1][17580/51233] Data 0.310 (0.311) Elapsed 156m 12s (remain 298m 59s) Loss: 0.0103(0.0643) Grad: 0.2710  
Epoch: [1][17600/51233] Data 0.308 (0.311) Elapsed 156m 22s (remain 298m 48s) Loss: 0.2279(0.0643) Grad: 1.8281  
Epoch: [1][17620/51233] Data 0.317 (0.311) Elapsed 156m 33s (remain 298m 37s) Loss: 0.0222(0.0643) Grad: 0.4909  
Epoch: [1][17640/51233] Data 0.318 (0.311) Elapsed 156m 43s (remain 298m 27s) Loss: 0.1654(0.0643) Grad: 1.8888  
Epoch: [1][17660/51233] Data 0.309 (0.311) Elapsed 156m 54s (remain 298m 16s) Loss: 0.0130(0.0642) Grad: 0.2430  
Epoch: [1][17680/51233] Data 0.310 (0.311) Elapsed 157m 5s (remain 298m 5s) Loss: 0.0293(0.0643) Grad: 0.6093  
Epoch: [1][17700/51233] Data 0.297 (0.311) Elapsed 157m 15s (remain 297m 55s) Loss: 0.0209(0.0643) Grad: 0.3948  
Epoch: [1][17720/51233] Data 0.310 (0.311) Elapsed 157m 26s (remain 297m 44s) Loss: 0.0801(0.0643) Grad: 1.1408  
Epoch: [1][17740/51233] Data 0.310 (0.311) Elapsed 157m 37s (remain 297m 33s) Loss: 0.0083(0.0643) Grad: 0.1510  
Epoch: [1][17760/51233] Data 0.317 (0.311) Elapsed 157m 47s (remain 297m 23s) Loss: 0.0694(0.0643) Grad: 0.7943  
Epoch: [1][17780/51233] Data 0.309 (0.311) Elapsed 157m 58s (remain 297m 12s) Loss: 0.0020(0.0643) Grad: 0.0303  
Epoch: [1][17800/51233] Data 0.318 (0.311) Elapsed 158m 9s (remain 297m 1s) Loss: 0.0306(0.0643) Grad: 0.5031  
Epoch: [1][17820/51233] Data 0.302 (0.311) Elapsed 158m 19s (remain 296m 51s) Loss: 0.0531(0.0643) Grad: 1.2269  
Epoch: [1][17840/51233] Data 0.318 (0.311) Elapsed 158m 30s (remain 296m 40s) Loss: 0.0041(0.0643) Grad: 0.0574  
Epoch: [1][17860/51233] Data 0.318 (0.311) Elapsed 158m 41s (remain 296m 29s) Loss: 0.0465(0.0643) Grad: 0.6423  
Epoch: [1][17880/51233] Data 0.304 (0.311) Elapsed 158m 51s (remain 296m 19s) Loss: 0.0036(0.0642) Grad: 0.0489  
Epoch: [1][17900/51233] Data 0.319 (0.311) Elapsed 159m 2s (remain 296m 8s) Loss: 0.0232(0.0642) Grad: 0.6477  
Epoch: [1][17920/51233] Data 0.308 (0.311) Elapsed 159m 13s (remain 295m 57s) Loss: 0.0065(0.0642) Grad: 0.1583  
Epoch: [1][17940/51233] Data 0.317 (0.311) Elapsed 159m 23s (remain 295m 47s) Loss: 0.0655(0.0642) Grad: 0.9800  
Epoch: [1][17960/51233] Data 0.310 (0.311) Elapsed 159m 34s (remain 295m 36s) Loss: 0.0060(0.0642) Grad: 0.1416  
Epoch: [1][17980/51233] Data 0.317 (0.311) Elapsed 159m 45s (remain 295m 25s) Loss: 0.0012(0.0642) Grad: 0.0141  
Epoch: [1][18000/51233] Data 0.317 (0.311) Elapsed 159m 55s (remain 295m 15s) Loss: 0.1338(0.0642) Grad: 1.7041  
Epoch: [1][18020/51233] Data 0.308 (0.311) Elapsed 160m 6s (remain 295m 4s) Loss: 0.0593(0.0642) Grad: 1.4420  
Epoch: [1][18040/51233] Data 0.317 (0.311) Elapsed 160m 17s (remain 294m 53s) Loss: 0.0075(0.0641) Grad: 0.1185  
Epoch: [1][18060/51233] Data 0.318 (0.311) Elapsed 160m 27s (remain 294m 43s) Loss: 0.0719(0.0641) Grad: 0.7436  
Epoch: [1][18080/51233] Data 0.319 (0.311) Elapsed 160m 38s (remain 294m 32s) Loss: 0.0296(0.0642) Grad: 0.7764  
Epoch: [1][18100/51233] Data 0.317 (0.311) Elapsed 160m 49s (remain 294m 21s) Loss: 0.0020(0.0641) Grad: 0.0504  
Epoch: [1][18120/51233] Data 0.317 (0.311) Elapsed 160m 59s (remain 294m 11s) Loss: 0.0403(0.0641) Grad: 0.5903  
Epoch: [1][18140/51233] Data 0.317 (0.311) Elapsed 161m 10s (remain 294m 0s) Loss: 0.1116(0.0641) Grad: 1.6113  
Epoch: [1][18160/51233] Data 0.308 (0.311) Elapsed 161m 21s (remain 293m 49s) Loss: 0.0169(0.0641) Grad: 0.3972  
Epoch: [1][18180/51233] Data 0.308 (0.311) Elapsed 161m 31s (remain 293m 39s) Loss: 0.0129(0.0640) Grad: 0.3302  
Epoch: [1][18200/51233] Data 0.308 (0.311) Elapsed 161m 42s (remain 293m 28s) Loss: 0.0021(0.0640) Grad: 0.0382  
Epoch: [1][18220/51233] Data 0.306 (0.311) Elapsed 161m 53s (remain 293m 17s) Loss: 0.0857(0.0640) Grad: 0.8323  
Epoch: [1][18240/51233] Data 0.304 (0.311) Elapsed 162m 3s (remain 293m 7s) Loss: 0.0289(0.0640) Grad: 0.2730  
Epoch: [1][18260/51233] Data 0.317 (0.311) Elapsed 162m 14s (remain 292m 56s) Loss: 0.1502(0.0640) Grad: 0.8188  
Epoch: [1][18280/51233] Data 0.312 (0.311) Elapsed 162m 25s (remain 292m 45s) Loss: 0.0026(0.0640) Grad: 0.0412  
Epoch: [1][18300/51233] Data 0.316 (0.311) Elapsed 162m 35s (remain 292m 35s) Loss: 0.0009(0.0641) Grad: 0.0155  
Epoch: [1][18320/51233] Data 0.308 (0.311) Elapsed 162m 46s (remain 292m 24s) Loss: 0.0180(0.0641) Grad: 0.5275  
Epoch: [1][18340/51233] Data 0.297 (0.311) Elapsed 162m 57s (remain 292m 13s) Loss: 0.0546(0.0641) Grad: 0.8868  
Epoch: [1][18360/51233] Data 0.319 (0.311) Elapsed 163m 7s (remain 292m 3s) Loss: 0.0122(0.0641) Grad: 0.2770  
Epoch: [1][18380/51233] Data 0.306 (0.311) Elapsed 163m 18s (remain 291m 52s) Loss: 0.0119(0.0641) Grad: 0.1593  
Epoch: [1][18400/51233] Data 0.308 (0.311) Elapsed 163m 29s (remain 291m 41s) Loss: 0.0030(0.0641) Grad: 0.0361  
Epoch: [1][18420/51233] Data 0.317 (0.311) Elapsed 163m 39s (remain 291m 31s) Loss: 0.0096(0.0641) Grad: 0.1353  
Epoch: [1][18440/51233] Data 0.314 (0.311) Elapsed 163m 50s (remain 291m 20s) Loss: 0.0337(0.0641) Grad: 0.3542  
Epoch: [1][18460/51233] Data 0.311 (0.311) Elapsed 164m 1s (remain 291m 10s) Loss: 0.0076(0.0641) Grad: 0.1427  
Epoch: [1][18480/51233] Data 0.296 (0.311) Elapsed 164m 11s (remain 290m 59s) Loss: 0.0408(0.0641) Grad: 1.0354  
Epoch: [1][18500/51233] Data 0.317 (0.311) Elapsed 164m 22s (remain 290m 48s) Loss: 0.0426(0.0641) Grad: 0.5924  
Epoch: [1][18520/51233] Data 0.309 (0.311) Elapsed 164m 33s (remain 290m 38s) Loss: 0.0942(0.0641) Grad: 1.7498  
Epoch: [1][18540/51233] Data 0.316 (0.311) Elapsed 164m 43s (remain 290m 27s) Loss: 0.0431(0.0641) Grad: 0.7065  
Epoch: [1][18560/51233] Data 0.318 (0.311) Elapsed 164m 54s (remain 290m 16s) Loss: 0.0103(0.0641) Grad: 0.1902  
Epoch: [1][18580/51233] Data 0.311 (0.311) Elapsed 165m 5s (remain 290m 6s) Loss: 0.0854(0.0641) Grad: 0.9527  
Epoch: [1][18600/51233] Data 0.309 (0.311) Elapsed 165m 15s (remain 289m 55s) Loss: 0.0241(0.0641) Grad: 0.4693  
Epoch: [1][18620/51233] Data 0.308 (0.311) Elapsed 165m 26s (remain 289m 45s) Loss: 0.0054(0.0641) Grad: 0.0772  
Epoch: [1][18640/51233] Data 0.309 (0.311) Elapsed 165m 37s (remain 289m 34s) Loss: 0.0135(0.0641) Grad: 0.2461  
Epoch: [1][18660/51233] Data 0.309 (0.311) Elapsed 165m 48s (remain 289m 23s) Loss: 0.0118(0.0640) Grad: 0.1526  
Epoch: [1][18680/51233] Data 0.309 (0.311) Elapsed 165m 58s (remain 289m 13s) Loss: 0.0356(0.0640) Grad: 0.5414  
Epoch: [1][18700/51233] Data 0.318 (0.311) Elapsed 166m 9s (remain 289m 2s) Loss: 0.0034(0.0640) Grad: 0.0519  
Epoch: [1][18720/51233] Data 0.297 (0.311) Elapsed 166m 20s (remain 288m 52s) Loss: 0.0545(0.0640) Grad: 0.7804  
Epoch: [1][18740/51233] Data 0.306 (0.311) Elapsed 166m 30s (remain 288m 41s) Loss: 0.0006(0.0640) Grad: 0.0136  
Epoch: [1][18760/51233] Data 0.318 (0.311) Elapsed 166m 41s (remain 288m 30s) Loss: 0.0510(0.0639) Grad: 0.8115  
Epoch: [1][18780/51233] Data 0.308 (0.311) Elapsed 166m 52s (remain 288m 20s) Loss: 0.0032(0.0639) Grad: 0.0750  
Epoch: [1][18800/51233] Data 0.307 (0.311) Elapsed 167m 2s (remain 288m 9s) Loss: 0.0046(0.0639) Grad: 0.1301  
Epoch: [1][18820/51233] Data 0.306 (0.311) Elapsed 167m 13s (remain 287m 58s) Loss: 0.0120(0.0639) Grad: 0.2474  
Epoch: [1][18840/51233] Data 0.310 (0.311) Elapsed 167m 24s (remain 287m 48s) Loss: 0.1431(0.0639) Grad: 1.6050  
Epoch: [1][18860/51233] Data 0.293 (0.311) Elapsed 167m 34s (remain 287m 37s) Loss: 0.0400(0.0640) Grad: 0.9010  
Epoch: [1][18880/51233] Data 0.298 (0.311) Elapsed 167m 45s (remain 287m 27s) Loss: 0.0583(0.0639) Grad: 0.8443  
Epoch: [1][18900/51233] Data 0.310 (0.311) Elapsed 167m 56s (remain 287m 16s) Loss: 0.3709(0.0639) Grad: 3.5908  
Epoch: [1][18920/51233] Data 0.294 (0.311) Elapsed 168m 6s (remain 287m 5s) Loss: 0.0263(0.0640) Grad: 0.4897  
Epoch: [1][18940/51233] Data 0.317 (0.311) Elapsed 168m 17s (remain 286m 55s) Loss: 0.1638(0.0639) Grad: 1.0069  
Epoch: [1][18960/51233] Data 0.302 (0.311) Elapsed 168m 28s (remain 286m 44s) Loss: 0.0536(0.0640) Grad: 0.5780  
Epoch: [1][18980/51233] Data 0.316 (0.311) Elapsed 168m 39s (remain 286m 33s) Loss: 0.1613(0.0640) Grad: 1.1038  
Epoch: [1][19000/51233] Data 0.308 (0.311) Elapsed 168m 49s (remain 286m 23s) Loss: 0.0954(0.0639) Grad: 1.3066  
Epoch: [1][19020/51233] Data 0.310 (0.311) Elapsed 169m 0s (remain 286m 12s) Loss: 0.0028(0.0639) Grad: 0.0377  
Epoch: [1][19040/51233] Data 0.313 (0.311) Elapsed 169m 11s (remain 286m 2s) Loss: 0.0195(0.0639) Grad: 0.3481  
Epoch: [1][19060/51233] Data 0.308 (0.311) Elapsed 169m 21s (remain 285m 51s) Loss: 0.0695(0.0639) Grad: 1.3701  
Epoch: [1][19080/51233] Data 0.304 (0.311) Elapsed 169m 32s (remain 285m 40s) Loss: 0.0067(0.0638) Grad: 0.1682  
Epoch: [1][19100/51233] Data 0.306 (0.311) Elapsed 169m 43s (remain 285m 30s) Loss: 0.0701(0.0638) Grad: 1.5376  
Epoch: [1][19120/51233] Data 0.315 (0.311) Elapsed 169m 53s (remain 285m 19s) Loss: 0.0043(0.0638) Grad: 0.0376  
Epoch: [1][19140/51233] Data 0.309 (0.311) Elapsed 170m 4s (remain 285m 8s) Loss: 0.0052(0.0638) Grad: 0.0933  
Epoch: [1][19160/51233] Data 0.297 (0.311) Elapsed 170m 15s (remain 284m 58s) Loss: 0.0394(0.0638) Grad: 0.8097  
Epoch: [1][19180/51233] Data 0.309 (0.311) Elapsed 170m 25s (remain 284m 47s) Loss: 0.0087(0.0638) Grad: 0.1614  
Epoch: [1][19200/51233] Data 0.307 (0.311) Elapsed 170m 36s (remain 284m 37s) Loss: 0.0417(0.0638) Grad: 0.5186  
Epoch: [1][19220/51233] Data 0.316 (0.311) Elapsed 170m 47s (remain 284m 26s) Loss: 0.0144(0.0638) Grad: 0.2952  
Epoch: [1][19240/51233] Data 0.317 (0.311) Elapsed 170m 57s (remain 284m 15s) Loss: 0.0340(0.0639) Grad: 0.3775  
Epoch: [1][19260/51233] Data 0.304 (0.311) Elapsed 171m 8s (remain 284m 5s) Loss: 0.0438(0.0639) Grad: 0.9051  
Epoch: [1][19280/51233] Data 0.306 (0.311) Elapsed 171m 19s (remain 283m 54s) Loss: 0.0003(0.0639) Grad: 0.0027  
Epoch: [1][19300/51233] Data 0.318 (0.311) Elapsed 171m 30s (remain 283m 44s) Loss: 0.0022(0.0639) Grad: 0.0208  
Epoch: [1][19320/51233] Data 0.310 (0.311) Elapsed 171m 40s (remain 283m 33s) Loss: 0.0120(0.0639) Grad: 0.1958  
Epoch: [1][19340/51233] Data 0.295 (0.311) Elapsed 171m 51s (remain 283m 22s) Loss: 0.0019(0.0638) Grad: 0.0219  
Epoch: [1][19360/51233] Data 0.306 (0.311) Elapsed 172m 2s (remain 283m 12s) Loss: 0.0083(0.0638) Grad: 0.0875  
Epoch: [1][19380/51233] Data 0.306 (0.311) Elapsed 172m 12s (remain 283m 1s) Loss: 0.0618(0.0638) Grad: 1.0056  
Epoch: [1][19400/51233] Data 0.308 (0.311) Elapsed 172m 23s (remain 282m 50s) Loss: 0.0389(0.0638) Grad: 0.9578  
Epoch: [1][19420/51233] Data 0.307 (0.311) Elapsed 172m 34s (remain 282m 40s) Loss: 0.1267(0.0638) Grad: 2.1682  
Epoch: [1][19440/51233] Data 0.308 (0.311) Elapsed 172m 44s (remain 282m 29s) Loss: 0.1487(0.0638) Grad: 1.8692  
Epoch: [1][19460/51233] Data 0.316 (0.311) Elapsed 172m 55s (remain 282m 19s) Loss: 0.0115(0.0638) Grad: 0.1834  
Epoch: [1][19480/51233] Data 0.317 (0.311) Elapsed 173m 6s (remain 282m 8s) Loss: 0.0122(0.0638) Grad: 0.2659  
Epoch: [1][19500/51233] Data 0.310 (0.311) Elapsed 173m 16s (remain 281m 57s) Loss: 0.0102(0.0638) Grad: 0.1478  
Epoch: [1][19520/51233] Data 0.318 (0.311) Elapsed 173m 27s (remain 281m 47s) Loss: 0.0057(0.0638) Grad: 0.0910  
Epoch: [1][19540/51233] Data 0.304 (0.311) Elapsed 173m 38s (remain 281m 36s) Loss: 0.2244(0.0638) Grad: 1.4823  
Epoch: [1][19560/51233] Data 0.300 (0.311) Elapsed 173m 48s (remain 281m 25s) Loss: 0.0335(0.0638) Grad: 0.3677  
Epoch: [1][19580/51233] Data 0.314 (0.311) Elapsed 173m 59s (remain 281m 15s) Loss: 0.0396(0.0638) Grad: 0.9980  
Epoch: [1][19600/51233] Data 0.308 (0.311) Elapsed 174m 10s (remain 281m 4s) Loss: 0.0206(0.0637) Grad: 0.3527  
Epoch: [1][19620/51233] Data 0.307 (0.311) Elapsed 174m 20s (remain 280m 53s) Loss: 0.0115(0.0637) Grad: 0.2156  
Epoch: [1][19640/51233] Data 0.308 (0.311) Elapsed 174m 31s (remain 280m 43s) Loss: 0.0030(0.0637) Grad: 0.0393  
Epoch: [1][19660/51233] Data 0.318 (0.311) Elapsed 174m 42s (remain 280m 32s) Loss: 0.0015(0.0637) Grad: 0.0189  
Epoch: [1][19680/51233] Data 0.309 (0.311) Elapsed 174m 53s (remain 280m 22s) Loss: 0.1588(0.0637) Grad: 0.6781  
Epoch: [1][19700/51233] Data 0.313 (0.311) Elapsed 175m 3s (remain 280m 11s) Loss: 0.0170(0.0637) Grad: 0.2136  
Epoch: [1][19720/51233] Data 0.309 (0.311) Elapsed 175m 14s (remain 280m 0s) Loss: 0.0116(0.0637) Grad: 0.2007  
Epoch: [1][19740/51233] Data 0.308 (0.311) Elapsed 175m 25s (remain 279m 50s) Loss: 0.2025(0.0637) Grad: 1.4774  
Epoch: [1][19760/51233] Data 0.299 (0.311) Elapsed 175m 35s (remain 279m 39s) Loss: 0.0134(0.0637) Grad: 0.3461  
Epoch: [1][19780/51233] Data 0.310 (0.311) Elapsed 175m 46s (remain 279m 28s) Loss: 0.0400(0.0637) Grad: 0.5620  
Epoch: [1][19800/51233] Data 0.305 (0.311) Elapsed 175m 57s (remain 279m 18s) Loss: 0.0227(0.0637) Grad: 0.5473  
Epoch: [1][19820/51233] Data 0.309 (0.311) Elapsed 176m 7s (remain 279m 7s) Loss: 0.0011(0.0637) Grad: 0.0180  
Epoch: [1][19840/51233] Data 0.308 (0.311) Elapsed 176m 18s (remain 278m 57s) Loss: 0.0514(0.0637) Grad: 0.4923  
Epoch: [1][19860/51233] Data 0.317 (0.311) Elapsed 176m 29s (remain 278m 46s) Loss: 0.2941(0.0637) Grad: 2.6344  
Epoch: [1][19880/51233] Data 0.316 (0.311) Elapsed 176m 39s (remain 278m 35s) Loss: 0.0061(0.0637) Grad: 0.0689  
Epoch: [1][19900/51233] Data 0.308 (0.311) Elapsed 176m 50s (remain 278m 25s) Loss: 0.2204(0.0637) Grad: 1.4711  
Epoch: [1][19920/51233] Data 0.308 (0.311) Elapsed 177m 1s (remain 278m 14s) Loss: 0.0843(0.0637) Grad: 1.7916  
Epoch: [1][19940/51233] Data 0.308 (0.311) Elapsed 177m 11s (remain 278m 3s) Loss: 0.0259(0.0637) Grad: 0.3429  
Epoch: [1][19960/51233] Data 0.318 (0.311) Elapsed 177m 22s (remain 277m 53s) Loss: 0.0491(0.0637) Grad: 0.8963  
Epoch: [1][19980/51233] Data 0.309 (0.311) Elapsed 177m 33s (remain 277m 42s) Loss: 0.0152(0.0637) Grad: 0.2231  
Epoch: [1][20000/51233] Data 0.316 (0.311) Elapsed 177m 43s (remain 277m 32s) Loss: 0.0091(0.0637) Grad: 0.1703  
Epoch: [1][20020/51233] Data 0.306 (0.311) Elapsed 177m 54s (remain 277m 21s) Loss: 0.0980(0.0637) Grad: 1.2604  
Epoch: [1][20040/51233] Data 0.314 (0.311) Elapsed 178m 5s (remain 277m 10s) Loss: 0.1282(0.0637) Grad: 0.6889  
Epoch: [1][20060/51233] Data 0.306 (0.311) Elapsed 178m 16s (remain 277m 0s) Loss: 0.0120(0.0636) Grad: 0.3427  
Epoch: [1][20080/51233] Data 0.309 (0.311) Elapsed 178m 26s (remain 276m 49s) Loss: 0.0075(0.0636) Grad: 0.0955  
Epoch: [1][20100/51233] Data 0.308 (0.311) Elapsed 178m 37s (remain 276m 38s) Loss: 0.0051(0.0636) Grad: 0.0525  
Epoch: [1][20120/51233] Data 0.318 (0.311) Elapsed 178m 48s (remain 276m 28s) Loss: 0.1053(0.0636) Grad: 1.4339  
Epoch: [1][20140/51233] Data 0.317 (0.311) Elapsed 178m 58s (remain 276m 17s) Loss: 0.0933(0.0636) Grad: 0.6165  
Epoch: [1][20160/51233] Data 0.310 (0.311) Elapsed 179m 9s (remain 276m 7s) Loss: 0.0040(0.0635) Grad: 0.0427  
Epoch: [1][20180/51233] Data 0.316 (0.311) Elapsed 179m 20s (remain 275m 56s) Loss: 0.0038(0.0636) Grad: 0.0906  
Epoch: [1][20200/51233] Data 0.294 (0.311) Elapsed 179m 30s (remain 275m 45s) Loss: 0.1227(0.0635) Grad: 1.5726  
Epoch: [1][20220/51233] Data 0.317 (0.311) Elapsed 179m 41s (remain 275m 35s) Loss: 0.0841(0.0635) Grad: 1.1021  
Epoch: [1][20240/51233] Data 0.305 (0.311) Elapsed 179m 52s (remain 275m 24s) Loss: 0.6160(0.0635) Grad: 2.0515  
Epoch: [1][20260/51233] Data 0.309 (0.311) Elapsed 180m 2s (remain 275m 13s) Loss: 0.0566(0.0635) Grad: 0.7074  
Epoch: [1][20280/51233] Data 0.307 (0.311) Elapsed 180m 13s (remain 275m 3s) Loss: 0.0169(0.0635) Grad: 0.3322  
Epoch: [1][20300/51233] Data 0.307 (0.311) Elapsed 180m 24s (remain 274m 52s) Loss: 0.0256(0.0635) Grad: 0.5483  
Epoch: [1][20320/51233] Data 0.304 (0.311) Elapsed 180m 34s (remain 274m 42s) Loss: 0.0262(0.0635) Grad: 0.5607  
Epoch: [1][20340/51233] Data 0.310 (0.311) Elapsed 180m 45s (remain 274m 31s) Loss: 0.0160(0.0634) Grad: 0.3812  
Epoch: [1][20360/51233] Data 0.318 (0.311) Elapsed 180m 56s (remain 274m 20s) Loss: 0.0444(0.0635) Grad: 1.0521  
Epoch: [1][20380/51233] Data 0.308 (0.311) Elapsed 181m 7s (remain 274m 10s) Loss: 0.0164(0.0634) Grad: 0.1746  
Epoch: [1][20400/51233] Data 0.310 (0.311) Elapsed 181m 17s (remain 273m 59s) Loss: 0.0545(0.0634) Grad: 0.6662  
Epoch: [1][20420/51233] Data 0.317 (0.311) Elapsed 181m 28s (remain 273m 48s) Loss: 0.0194(0.0634) Grad: 0.2579  
Epoch: [1][20440/51233] Data 0.308 (0.311) Elapsed 181m 39s (remain 273m 38s) Loss: 0.0666(0.0634) Grad: 1.5072  
Epoch: [1][20460/51233] Data 0.309 (0.311) Elapsed 181m 49s (remain 273m 27s) Loss: 0.0089(0.0634) Grad: 0.1268  
Epoch: [1][20480/51233] Data 0.306 (0.311) Elapsed 182m 0s (remain 273m 17s) Loss: 0.0027(0.0634) Grad: 0.0466  
Epoch: [1][20500/51233] Data 0.310 (0.311) Elapsed 182m 11s (remain 273m 6s) Loss: 0.0066(0.0634) Grad: 0.1079  
Epoch: [1][20520/51233] Data 0.295 (0.311) Elapsed 182m 21s (remain 272m 55s) Loss: 0.1118(0.0634) Grad: 1.0439  
Epoch: [1][20540/51233] Data 0.317 (0.311) Elapsed 182m 32s (remain 272m 45s) Loss: 0.0107(0.0634) Grad: 0.1693  
Epoch: [1][20560/51233] Data 0.308 (0.311) Elapsed 182m 43s (remain 272m 34s) Loss: 0.0053(0.0634) Grad: 0.1519  
Epoch: [1][20580/51233] Data 0.288 (0.311) Elapsed 182m 53s (remain 272m 23s) Loss: 0.0536(0.0633) Grad: 0.8118  
Epoch: [1][20600/51233] Data 0.310 (0.311) Elapsed 183m 4s (remain 272m 13s) Loss: 0.0140(0.0634) Grad: 0.1760  
Epoch: [1][20620/51233] Data 0.310 (0.311) Elapsed 183m 15s (remain 272m 2s) Loss: 0.0777(0.0633) Grad: 0.7517  
Epoch: [1][20640/51233] Data 0.306 (0.311) Elapsed 183m 25s (remain 271m 51s) Loss: 0.0063(0.0633) Grad: 0.1018  
Epoch: [1][20660/51233] Data 0.306 (0.311) Elapsed 183m 36s (remain 271m 41s) Loss: 0.0021(0.0633) Grad: 0.0397  
Epoch: [1][20680/51233] Data 0.313 (0.311) Elapsed 183m 47s (remain 271m 30s) Loss: 0.0086(0.0633) Grad: 0.1497  
Epoch: [1][20700/51233] Data 0.294 (0.311) Elapsed 183m 58s (remain 271m 20s) Loss: 0.0165(0.0633) Grad: 0.2000  
Epoch: [1][20720/51233] Data 0.310 (0.311) Elapsed 184m 8s (remain 271m 9s) Loss: 0.0088(0.0633) Grad: 0.1863  
Epoch: [1][20740/51233] Data 0.318 (0.311) Elapsed 184m 19s (remain 270m 58s) Loss: 0.0014(0.0633) Grad: 0.0171  
Epoch: [1][20760/51233] Data 0.308 (0.311) Elapsed 184m 30s (remain 270m 48s) Loss: 0.0283(0.0633) Grad: 0.8098  
Epoch: [1][20780/51233] Data 0.308 (0.311) Elapsed 184m 40s (remain 270m 37s) Loss: 0.0141(0.0633) Grad: 0.2457  
Epoch: [1][20800/51233] Data 0.303 (0.311) Elapsed 184m 51s (remain 270m 26s) Loss: 0.0317(0.0633) Grad: 0.4713  
Epoch: [1][20820/51233] Data 0.308 (0.311) Elapsed 185m 2s (remain 270m 16s) Loss: 0.1958(0.0633) Grad: 1.7826  
Epoch: [1][20840/51233] Data 0.316 (0.311) Elapsed 185m 12s (remain 270m 5s) Loss: 0.0050(0.0633) Grad: 0.0616  
Epoch: [1][20860/51233] Data 0.311 (0.311) Elapsed 185m 23s (remain 269m 55s) Loss: 0.0764(0.0633) Grad: 1.1538  
Epoch: [1][20880/51233] Data 0.299 (0.311) Elapsed 185m 34s (remain 269m 44s) Loss: 0.0133(0.0633) Grad: 0.2162  
Epoch: [1][20900/51233] Data 0.312 (0.311) Elapsed 185m 44s (remain 269m 33s) Loss: 0.0027(0.0632) Grad: 0.0858  
Epoch: [1][20920/51233] Data 0.307 (0.311) Elapsed 185m 55s (remain 269m 23s) Loss: 0.1728(0.0632) Grad: 1.2307  
Epoch: [1][20940/51233] Data 0.307 (0.311) Elapsed 186m 6s (remain 269m 12s) Loss: 0.4286(0.0633) Grad: 1.7344  
Epoch: [1][20960/51233] Data 0.303 (0.311) Elapsed 186m 16s (remain 269m 1s) Loss: 0.0115(0.0633) Grad: 0.1229  
Epoch: [1][20980/51233] Data 0.315 (0.311) Elapsed 186m 27s (remain 268m 51s) Loss: 0.0055(0.0633) Grad: 0.1123  
Epoch: [1][21000/51233] Data 0.317 (0.311) Elapsed 186m 38s (remain 268m 40s) Loss: 0.0024(0.0632) Grad: 0.0269  
Epoch: [1][21020/51233] Data 0.314 (0.311) Elapsed 186m 49s (remain 268m 29s) Loss: 0.0848(0.0632) Grad: 1.5478  
Epoch: [1][21040/51233] Data 0.314 (0.311) Elapsed 186m 59s (remain 268m 19s) Loss: 0.0035(0.0632) Grad: 0.1040  
Epoch: [1][21060/51233] Data 0.318 (0.311) Elapsed 187m 10s (remain 268m 8s) Loss: 0.0140(0.0632) Grad: 0.3250  
Epoch: [1][21080/51233] Data 0.319 (0.311) Elapsed 187m 21s (remain 267m 58s) Loss: 0.0455(0.0632) Grad: 0.7710  
Epoch: [1][21100/51233] Data 0.301 (0.311) Elapsed 187m 31s (remain 267m 47s) Loss: 0.1389(0.0632) Grad: 1.8059  
Epoch: [1][21120/51233] Data 0.316 (0.311) Elapsed 187m 42s (remain 267m 36s) Loss: 0.0428(0.0632) Grad: 0.6226  
Epoch: [1][21140/51233] Data 0.317 (0.311) Elapsed 187m 53s (remain 267m 26s) Loss: 0.3684(0.0632) Grad: 2.1296  
Epoch: [1][21160/51233] Data 0.318 (0.311) Elapsed 188m 3s (remain 267m 15s) Loss: 0.0026(0.0632) Grad: 0.0346  
Epoch: [1][21180/51233] Data 0.305 (0.311) Elapsed 188m 14s (remain 267m 4s) Loss: 0.0009(0.0633) Grad: 0.0122  
Epoch: [1][21200/51233] Data 0.309 (0.311) Elapsed 188m 25s (remain 266m 54s) Loss: 0.1309(0.0633) Grad: 1.5823  
Epoch: [1][21220/51233] Data 0.310 (0.311) Elapsed 188m 35s (remain 266m 43s) Loss: 0.1603(0.0633) Grad: 1.9197  
Epoch: [1][21240/51233] Data 0.308 (0.311) Elapsed 188m 46s (remain 266m 32s) Loss: 0.0038(0.0632) Grad: 0.0873  
Epoch: [1][21260/51233] Data 0.308 (0.311) Elapsed 188m 57s (remain 266m 22s) Loss: 0.0049(0.0632) Grad: 0.0993  
Epoch: [1][21280/51233] Data 0.311 (0.311) Elapsed 189m 7s (remain 266m 11s) Loss: 0.0505(0.0632) Grad: 0.5069  
Epoch: [1][21300/51233] Data 0.306 (0.311) Elapsed 189m 18s (remain 266m 1s) Loss: 0.0173(0.0632) Grad: 0.4588  
Epoch: [1][21320/51233] Data 0.306 (0.311) Elapsed 189m 29s (remain 265m 50s) Loss: 0.0110(0.0631) Grad: 0.3700  
Epoch: [1][21340/51233] Data 0.315 (0.311) Elapsed 189m 40s (remain 265m 39s) Loss: 0.0090(0.0631) Grad: 0.2756  
Epoch: [1][21360/51233] Data 0.318 (0.311) Elapsed 189m 50s (remain 265m 29s) Loss: 0.1804(0.0631) Grad: 1.6921  
Epoch: [1][21380/51233] Data 0.305 (0.311) Elapsed 190m 1s (remain 265m 18s) Loss: 0.0192(0.0632) Grad: 0.3682  
Epoch: [1][21400/51233] Data 0.305 (0.311) Elapsed 190m 12s (remain 265m 7s) Loss: 0.0688(0.0632) Grad: 0.9838  
Epoch: [1][21420/51233] Data 0.319 (0.311) Elapsed 190m 22s (remain 264m 57s) Loss: 0.0023(0.0632) Grad: 0.0408  
Epoch: [1][21440/51233] Data 0.318 (0.311) Elapsed 190m 33s (remain 264m 46s) Loss: 0.0414(0.0632) Grad: 0.4793  
Epoch: [1][21460/51233] Data 0.307 (0.311) Elapsed 190m 44s (remain 264m 35s) Loss: 0.1558(0.0632) Grad: 1.7489  
Epoch: [1][21480/51233] Data 0.283 (0.311) Elapsed 190m 54s (remain 264m 25s) Loss: 0.0951(0.0632) Grad: 1.2155  
Epoch: [1][21500/51233] Data 0.317 (0.311) Elapsed 191m 5s (remain 264m 14s) Loss: 0.4829(0.0632) Grad: 1.9953  
Epoch: [1][21520/51233] Data 0.303 (0.311) Elapsed 191m 16s (remain 264m 4s) Loss: 0.0987(0.0632) Grad: 1.3644  
Epoch: [1][21540/51233] Data 0.309 (0.311) Elapsed 191m 26s (remain 263m 53s) Loss: 0.0038(0.0632) Grad: 0.0735  
Epoch: [1][21560/51233] Data 0.318 (0.311) Elapsed 191m 37s (remain 263m 42s) Loss: 0.0040(0.0631) Grad: 0.0526  
Epoch: [1][21580/51233] Data 0.306 (0.311) Elapsed 191m 48s (remain 263m 32s) Loss: 0.0010(0.0631) Grad: 0.0146  
Epoch: [1][21600/51233] Data 0.305 (0.311) Elapsed 191m 58s (remain 263m 21s) Loss: 0.0310(0.0631) Grad: 0.3294  
Epoch: [1][21620/51233] Data 0.309 (0.311) Elapsed 192m 9s (remain 263m 10s) Loss: 0.0161(0.0631) Grad: 0.2712  
Epoch: [1][21640/51233] Data 0.310 (0.311) Elapsed 192m 20s (remain 263m 0s) Loss: 0.0664(0.0631) Grad: 1.0951  
Epoch: [1][21660/51233] Data 0.300 (0.311) Elapsed 192m 31s (remain 262m 49s) Loss: 0.2247(0.0631) Grad: 2.2475  
Epoch: [1][21680/51233] Data 0.317 (0.311) Elapsed 192m 41s (remain 262m 39s) Loss: 0.0019(0.0631) Grad: 0.0289  
Epoch: [1][21700/51233] Data 0.308 (0.311) Elapsed 192m 52s (remain 262m 28s) Loss: 0.0122(0.0630) Grad: 0.2233  
Epoch: [1][21720/51233] Data 0.310 (0.311) Elapsed 193m 3s (remain 262m 17s) Loss: 0.2777(0.0631) Grad: 2.3326  
Epoch: [1][21740/51233] Data 0.308 (0.311) Elapsed 193m 13s (remain 262m 7s) Loss: 0.0058(0.0631) Grad: 0.0995  
Epoch: [1][21760/51233] Data 0.318 (0.311) Elapsed 193m 24s (remain 261m 56s) Loss: 0.0129(0.0630) Grad: 0.1372  
Epoch: [1][21780/51233] Data 0.313 (0.311) Elapsed 193m 35s (remain 261m 45s) Loss: 0.0246(0.0631) Grad: 0.5340  
Epoch: [1][21800/51233] Data 0.308 (0.311) Elapsed 193m 45s (remain 261m 35s) Loss: 0.0146(0.0630) Grad: 0.3243  
Epoch: [1][21820/51233] Data 0.309 (0.311) Elapsed 193m 56s (remain 261m 24s) Loss: 0.3380(0.0630) Grad: 2.5255  
Epoch: [1][21840/51233] Data 0.307 (0.311) Elapsed 194m 7s (remain 261m 13s) Loss: 0.0039(0.0630) Grad: 0.0444  
Epoch: [1][21860/51233] Data 0.309 (0.311) Elapsed 194m 17s (remain 261m 3s) Loss: 0.0369(0.0630) Grad: 0.5981  
Epoch: [1][21880/51233] Data 0.317 (0.311) Elapsed 194m 28s (remain 260m 52s) Loss: 0.0032(0.0630) Grad: 0.0510  
Epoch: [1][21900/51233] Data 0.307 (0.311) Elapsed 194m 39s (remain 260m 42s) Loss: 0.0142(0.0630) Grad: 0.3418  
Epoch: [1][21920/51233] Data 0.307 (0.311) Elapsed 194m 49s (remain 260m 31s) Loss: 0.0027(0.0630) Grad: 0.0434  
Epoch: [1][21940/51233] Data 0.308 (0.311) Elapsed 195m 0s (remain 260m 20s) Loss: 0.0069(0.0629) Grad: 0.1300  
Epoch: [1][21960/51233] Data 0.311 (0.311) Elapsed 195m 11s (remain 260m 10s) Loss: 0.0101(0.0629) Grad: 0.2017  
Epoch: [1][21980/51233] Data 0.308 (0.311) Elapsed 195m 22s (remain 259m 59s) Loss: 0.3334(0.0629) Grad: 1.5335  
Epoch: [1][22000/51233] Data 0.298 (0.311) Elapsed 195m 32s (remain 259m 48s) Loss: 0.0145(0.0629) Grad: 0.3206  
Epoch: [1][22020/51233] Data 0.308 (0.311) Elapsed 195m 43s (remain 259m 38s) Loss: 0.1188(0.0629) Grad: 1.2217  
Epoch: [1][22040/51233] Data 0.316 (0.311) Elapsed 195m 54s (remain 259m 27s) Loss: 0.0071(0.0629) Grad: 0.1913  
Epoch: [1][22060/51233] Data 0.300 (0.311) Elapsed 196m 4s (remain 259m 16s) Loss: 0.0179(0.0629) Grad: 0.3158  
Epoch: [1][22080/51233] Data 0.308 (0.311) Elapsed 196m 15s (remain 259m 6s) Loss: 0.0276(0.0629) Grad: 0.6715  
Epoch: [1][22100/51233] Data 0.316 (0.311) Elapsed 196m 26s (remain 258m 55s) Loss: 0.0037(0.0629) Grad: 0.0720  
Epoch: [1][22120/51233] Data 0.309 (0.311) Elapsed 196m 36s (remain 258m 45s) Loss: 0.0346(0.0629) Grad: 0.4833  
Epoch: [1][22140/51233] Data 0.305 (0.311) Elapsed 196m 47s (remain 258m 34s) Loss: 0.1740(0.0629) Grad: 2.3554  
Epoch: [1][22160/51233] Data 0.308 (0.311) Elapsed 196m 58s (remain 258m 23s) Loss: 0.0270(0.0629) Grad: 0.2902  
Epoch: [1][22180/51233] Data 0.316 (0.311) Elapsed 197m 8s (remain 258m 13s) Loss: 0.0034(0.0629) Grad: 0.0352  
Epoch: [1][22200/51233] Data 0.295 (0.311) Elapsed 197m 19s (remain 258m 2s) Loss: 0.0216(0.0629) Grad: 0.3081  
Epoch: [1][22220/51233] Data 0.316 (0.311) Elapsed 197m 30s (remain 257m 51s) Loss: 0.0070(0.0629) Grad: 0.1241  
Epoch: [1][22240/51233] Data 0.317 (0.311) Elapsed 197m 40s (remain 257m 41s) Loss: 0.0154(0.0628) Grad: 0.3664  
Epoch: [1][22260/51233] Data 0.318 (0.311) Elapsed 197m 51s (remain 257m 30s) Loss: 0.0101(0.0629) Grad: 0.1711  
Epoch: [1][22280/51233] Data 0.314 (0.311) Elapsed 198m 2s (remain 257m 19s) Loss: 0.1290(0.0628) Grad: 1.4577  
Epoch: [1][22300/51233] Data 0.307 (0.311) Elapsed 198m 13s (remain 257m 9s) Loss: 0.0914(0.0628) Grad: 1.4739  
Epoch: [1][22320/51233] Data 0.316 (0.311) Elapsed 198m 23s (remain 256m 58s) Loss: 0.0008(0.0628) Grad: 0.0111  
Epoch: [1][22340/51233] Data 0.317 (0.311) Elapsed 198m 34s (remain 256m 48s) Loss: 0.1631(0.0628) Grad: 1.7325  
Epoch: [1][22360/51233] Data 0.307 (0.311) Elapsed 198m 45s (remain 256m 37s) Loss: 0.2263(0.0628) Grad: 1.6829  
Epoch: [1][22380/51233] Data 0.317 (0.311) Elapsed 198m 55s (remain 256m 26s) Loss: 0.2581(0.0628) Grad: 1.3208  
Epoch: [1][22400/51233] Data 0.318 (0.311) Elapsed 199m 6s (remain 256m 16s) Loss: 0.0066(0.0628) Grad: 0.0890  
Epoch: [1][22420/51233] Data 0.317 (0.311) Elapsed 199m 17s (remain 256m 5s) Loss: 0.0428(0.0628) Grad: 0.9533  
Epoch: [1][22440/51233] Data 0.306 (0.311) Elapsed 199m 27s (remain 255m 54s) Loss: 0.0062(0.0628) Grad: 0.1201  
Epoch: [1][22460/51233] Data 0.309 (0.311) Elapsed 199m 38s (remain 255m 44s) Loss: 0.0106(0.0628) Grad: 0.1768  
Epoch: [1][22480/51233] Data 0.295 (0.311) Elapsed 199m 49s (remain 255m 33s) Loss: 0.0061(0.0628) Grad: 0.1244  
Epoch: [1][22500/51233] Data 0.294 (0.311) Elapsed 199m 59s (remain 255m 22s) Loss: 0.0036(0.0628) Grad: 0.0908  
Epoch: [1][22520/51233] Data 0.312 (0.311) Elapsed 200m 10s (remain 255m 12s) Loss: 0.0013(0.0628) Grad: 0.0263  
Epoch: [1][22540/51233] Data 0.308 (0.311) Elapsed 200m 21s (remain 255m 1s) Loss: 0.0588(0.0628) Grad: 0.9375  
Epoch: [1][22560/51233] Data 0.310 (0.311) Elapsed 200m 31s (remain 254m 50s) Loss: 0.0102(0.0628) Grad: 0.2876  
Epoch: [1][22580/51233] Data 0.312 (0.311) Elapsed 200m 42s (remain 254m 40s) Loss: 0.0438(0.0628) Grad: 0.5700  
Epoch: [1][22600/51233] Data 0.317 (0.311) Elapsed 200m 53s (remain 254m 29s) Loss: 0.0030(0.0627) Grad: 0.0880  
Epoch: [1][22620/51233] Data 0.309 (0.311) Elapsed 201m 4s (remain 254m 19s) Loss: 0.0041(0.0627) Grad: 0.0774  
Epoch: [1][22640/51233] Data 0.307 (0.311) Elapsed 201m 14s (remain 254m 8s) Loss: 0.3268(0.0627) Grad: 2.3135  
Epoch: [1][22660/51233] Data 0.310 (0.311) Elapsed 201m 25s (remain 253m 57s) Loss: 0.0073(0.0627) Grad: 0.1181  
Epoch: [1][22680/51233] Data 0.301 (0.311) Elapsed 201m 36s (remain 253m 47s) Loss: 0.0014(0.0627) Grad: 0.0168  
Epoch: [1][22700/51233] Data 0.307 (0.311) Elapsed 201m 46s (remain 253m 36s) Loss: 0.0112(0.0627) Grad: 0.1881  
Epoch: [1][22720/51233] Data 0.313 (0.311) Elapsed 201m 57s (remain 253m 25s) Loss: 0.2890(0.0627) Grad: 2.0594  
Epoch: [1][22740/51233] Data 0.309 (0.311) Elapsed 202m 8s (remain 253m 15s) Loss: 0.0021(0.0627) Grad: 0.0585  
Epoch: [1][22760/51233] Data 0.317 (0.311) Elapsed 202m 18s (remain 253m 4s) Loss: 0.0102(0.0627) Grad: 0.1229  
Epoch: [1][22780/51233] Data 0.308 (0.311) Elapsed 202m 29s (remain 252m 53s) Loss: 0.0090(0.0627) Grad: 0.1339  
Epoch: [1][22800/51233] Data 0.317 (0.311) Elapsed 202m 40s (remain 252m 43s) Loss: 0.1659(0.0627) Grad: 1.3875  
Epoch: [1][22820/51233] Data 0.308 (0.311) Elapsed 202m 50s (remain 252m 32s) Loss: 0.0335(0.0627) Grad: 0.6265  
Epoch: [1][22840/51233] Data 0.307 (0.311) Elapsed 203m 1s (remain 252m 21s) Loss: 0.1560(0.0627) Grad: 1.3110  
Epoch: [1][22860/51233] Data 0.317 (0.311) Elapsed 203m 12s (remain 252m 11s) Loss: 0.5321(0.0627) Grad: 1.9577  
Epoch: [1][22880/51233] Data 0.308 (0.311) Elapsed 203m 22s (remain 252m 0s) Loss: 0.0066(0.0627) Grad: 0.1084  
Epoch: [1][22900/51233] Data 0.311 (0.311) Elapsed 203m 33s (remain 251m 49s) Loss: 0.0260(0.0627) Grad: 0.3824  
Epoch: [1][22920/51233] Data 0.313 (0.311) Elapsed 203m 44s (remain 251m 39s) Loss: 0.0086(0.0627) Grad: 0.1016  
Epoch: [1][22940/51233] Data 0.304 (0.311) Elapsed 203m 54s (remain 251m 28s) Loss: 0.0242(0.0627) Grad: 0.4839  
Epoch: [1][22960/51233] Data 0.317 (0.311) Elapsed 204m 5s (remain 251m 17s) Loss: 0.3295(0.0626) Grad: 2.8604  
Epoch: [1][22980/51233] Data 0.308 (0.311) Elapsed 204m 16s (remain 251m 7s) Loss: 0.0223(0.0626) Grad: 0.3284  
Epoch: [1][23000/51233] Data 0.310 (0.311) Elapsed 204m 26s (remain 250m 56s) Loss: 0.0348(0.0626) Grad: 0.7472  
Epoch: [1][23020/51233] Data 0.316 (0.311) Elapsed 204m 37s (remain 250m 45s) Loss: 0.0062(0.0626) Grad: 0.1469  
Epoch: [1][23040/51233] Data 0.308 (0.311) Elapsed 204m 48s (remain 250m 35s) Loss: 0.0219(0.0626) Grad: 0.2250  
Epoch: [1][23060/51233] Data 0.309 (0.311) Elapsed 204m 58s (remain 250m 24s) Loss: 0.1245(0.0626) Grad: 2.0762  
Epoch: [1][23080/51233] Data 0.307 (0.311) Elapsed 205m 9s (remain 250m 13s) Loss: 0.0179(0.0626) Grad: 0.4797  
Epoch: [1][23100/51233] Data 0.317 (0.311) Elapsed 205m 20s (remain 250m 3s) Loss: 0.0036(0.0626) Grad: 0.0438  
Epoch: [1][23120/51233] Data 0.309 (0.311) Elapsed 205m 30s (remain 249m 52s) Loss: 0.0569(0.0626) Grad: 0.8675  
Epoch: [1][23140/51233] Data 0.295 (0.311) Elapsed 205m 41s (remain 249m 41s) Loss: 0.1668(0.0626) Grad: 1.9317  
Epoch: [1][23160/51233] Data 0.306 (0.311) Elapsed 205m 52s (remain 249m 31s) Loss: 0.0027(0.0626) Grad: 0.0343  
Epoch: [1][23180/51233] Data 0.309 (0.311) Elapsed 206m 2s (remain 249m 20s) Loss: 0.0110(0.0626) Grad: 0.1943  
Epoch: [1][23200/51233] Data 0.307 (0.311) Elapsed 206m 13s (remain 249m 9s) Loss: 0.0741(0.0626) Grad: 0.7243  
Epoch: [1][23220/51233] Data 0.307 (0.311) Elapsed 206m 24s (remain 248m 59s) Loss: 0.0206(0.0626) Grad: 0.6021  
Epoch: [1][23240/51233] Data 0.308 (0.311) Elapsed 206m 34s (remain 248m 48s) Loss: 0.0030(0.0626) Grad: 0.0937  
Epoch: [1][23260/51233] Data 0.317 (0.311) Elapsed 206m 45s (remain 248m 37s) Loss: 0.1792(0.0626) Grad: 1.6628  
Epoch: [1][23280/51233] Data 0.317 (0.311) Elapsed 206m 56s (remain 248m 27s) Loss: 0.0805(0.0626) Grad: 0.4637  
Epoch: [1][23300/51233] Data 0.317 (0.311) Elapsed 207m 6s (remain 248m 16s) Loss: 0.2223(0.0625) Grad: 1.5502  
Epoch: [1][23320/51233] Data 0.314 (0.311) Elapsed 207m 17s (remain 248m 6s) Loss: 0.0215(0.0626) Grad: 0.3552  
Epoch: [1][23340/51233] Data 0.317 (0.311) Elapsed 207m 28s (remain 247m 55s) Loss: 0.0201(0.0626) Grad: 0.3331  
Epoch: [1][23360/51233] Data 0.308 (0.311) Elapsed 207m 38s (remain 247m 44s) Loss: 0.1325(0.0626) Grad: 1.3256  
Epoch: [1][23380/51233] Data 0.307 (0.311) Elapsed 207m 49s (remain 247m 34s) Loss: 0.1538(0.0626) Grad: 1.6567  
Epoch: [1][23400/51233] Data 0.305 (0.311) Elapsed 208m 0s (remain 247m 23s) Loss: 0.1747(0.0625) Grad: 1.9116  
Epoch: [1][23420/51233] Data 0.304 (0.311) Elapsed 208m 10s (remain 247m 12s) Loss: 0.0908(0.0625) Grad: 1.3558  
Epoch: [1][23440/51233] Data 0.308 (0.311) Elapsed 208m 21s (remain 247m 2s) Loss: 0.0020(0.0625) Grad: 0.0744  
Epoch: [1][23460/51233] Data 0.318 (0.311) Elapsed 208m 32s (remain 246m 51s) Loss: 0.0259(0.0625) Grad: 0.4193  
Epoch: [1][23480/51233] Data 0.306 (0.311) Elapsed 208m 43s (remain 246m 40s) Loss: 0.0017(0.0625) Grad: 0.0284  
Epoch: [1][23500/51233] Data 0.307 (0.311) Elapsed 208m 53s (remain 246m 30s) Loss: 0.0778(0.0625) Grad: 1.4544  
Epoch: [1][23520/51233] Data 0.309 (0.311) Elapsed 209m 4s (remain 246m 19s) Loss: 0.0300(0.0625) Grad: 0.5851  
Epoch: [1][23540/51233] Data 0.312 (0.311) Elapsed 209m 15s (remain 246m 8s) Loss: 0.0362(0.0625) Grad: 0.7308  
Epoch: [1][23560/51233] Data 0.308 (0.311) Elapsed 209m 25s (remain 245m 58s) Loss: 0.0199(0.0625) Grad: 0.2659  
Epoch: [1][23580/51233] Data 0.318 (0.311) Elapsed 209m 36s (remain 245m 47s) Loss: 0.0442(0.0625) Grad: 0.7925  
Epoch: [1][23600/51233] Data 0.317 (0.311) Elapsed 209m 47s (remain 245m 37s) Loss: 0.0565(0.0625) Grad: 1.1890  
Epoch: [1][23620/51233] Data 0.307 (0.311) Elapsed 209m 57s (remain 245m 26s) Loss: 0.0398(0.0625) Grad: 0.3777  
Epoch: [1][23640/51233] Data 0.317 (0.311) Elapsed 210m 8s (remain 245m 15s) Loss: 0.1446(0.0625) Grad: 1.7300  
Epoch: [1][23660/51233] Data 0.318 (0.311) Elapsed 210m 19s (remain 245m 5s) Loss: 0.0638(0.0625) Grad: 1.0186  
Epoch: [1][23680/51233] Data 0.317 (0.311) Elapsed 210m 29s (remain 244m 54s) Loss: 0.0111(0.0625) Grad: 0.1653  
Epoch: [1][23700/51233] Data 0.310 (0.311) Elapsed 210m 40s (remain 244m 43s) Loss: 0.0340(0.0625) Grad: 0.6222  
Epoch: [1][23720/51233] Data 0.307 (0.311) Elapsed 210m 51s (remain 244m 33s) Loss: 0.0127(0.0625) Grad: 0.2402  
Epoch: [1][23740/51233] Data 0.318 (0.311) Elapsed 211m 1s (remain 244m 22s) Loss: 0.0198(0.0625) Grad: 0.1909  
Epoch: [1][23760/51233] Data 0.302 (0.311) Elapsed 211m 12s (remain 244m 11s) Loss: 0.0036(0.0625) Grad: 0.0654  
Epoch: [1][23780/51233] Data 0.305 (0.311) Elapsed 211m 23s (remain 244m 1s) Loss: 0.0106(0.0625) Grad: 0.1369  
Epoch: [1][23800/51233] Data 0.309 (0.311) Elapsed 211m 34s (remain 243m 50s) Loss: 0.2027(0.0624) Grad: 2.3886  
Epoch: [1][23820/51233] Data 0.313 (0.311) Elapsed 211m 44s (remain 243m 39s) Loss: 0.0935(0.0624) Grad: 1.2220  
Epoch: [1][23840/51233] Data 0.305 (0.311) Elapsed 211m 55s (remain 243m 29s) Loss: 0.0042(0.0624) Grad: 0.0561  
Epoch: [1][23860/51233] Data 0.317 (0.311) Elapsed 212m 6s (remain 243m 18s) Loss: 0.0050(0.0624) Grad: 0.0608  
Epoch: [1][23880/51233] Data 0.308 (0.311) Elapsed 212m 16s (remain 243m 8s) Loss: 0.0281(0.0624) Grad: 0.5098  
Epoch: [1][23900/51233] Data 0.311 (0.311) Elapsed 212m 27s (remain 242m 57s) Loss: 0.1162(0.0624) Grad: 1.7118  
Epoch: [1][23920/51233] Data 0.308 (0.311) Elapsed 212m 38s (remain 242m 46s) Loss: 0.0046(0.0624) Grad: 0.0830  
Epoch: [1][23940/51233] Data 0.317 (0.311) Elapsed 212m 48s (remain 242m 36s) Loss: 0.0091(0.0624) Grad: 0.1224  
Epoch: [1][23960/51233] Data 0.293 (0.311) Elapsed 212m 59s (remain 242m 25s) Loss: 0.4814(0.0624) Grad: 3.2533  
Epoch: [1][23980/51233] Data 0.315 (0.311) Elapsed 213m 10s (remain 242m 14s) Loss: 0.0551(0.0624) Grad: 1.0342  
Epoch: [1][24000/51233] Data 0.317 (0.311) Elapsed 213m 20s (remain 242m 4s) Loss: 0.0064(0.0624) Grad: 0.0920  
Epoch: [1][24020/51233] Data 0.317 (0.311) Elapsed 213m 31s (remain 241m 53s) Loss: 0.0069(0.0624) Grad: 0.0998  
Epoch: [1][24040/51233] Data 0.318 (0.311) Elapsed 213m 42s (remain 241m 42s) Loss: 0.4190(0.0624) Grad: 2.8652  
Epoch: [1][24060/51233] Data 0.319 (0.311) Elapsed 213m 53s (remain 241m 32s) Loss: 0.0069(0.0624) Grad: 0.0880  
Epoch: [1][24080/51233] Data 0.318 (0.311) Elapsed 214m 3s (remain 241m 21s) Loss: 0.0783(0.0624) Grad: 1.1909  
Epoch: [1][24100/51233] Data 0.318 (0.311) Elapsed 214m 14s (remain 241m 10s) Loss: 0.5563(0.0624) Grad: 2.0821  
Epoch: [1][24120/51233] Data 0.317 (0.311) Elapsed 214m 25s (remain 241m 0s) Loss: 0.0493(0.0624) Grad: 0.4211  
Epoch: [1][24140/51233] Data 0.308 (0.311) Elapsed 214m 35s (remain 240m 49s) Loss: 0.0728(0.0624) Grad: 1.3874  
Epoch: [1][24160/51233] Data 0.312 (0.311) Elapsed 214m 46s (remain 240m 39s) Loss: 0.0146(0.0624) Grad: 0.4136  
Epoch: [1][24180/51233] Data 0.317 (0.311) Elapsed 214m 57s (remain 240m 28s) Loss: 0.0027(0.0624) Grad: 0.0688  
Epoch: [1][24200/51233] Data 0.318 (0.311) Elapsed 215m 7s (remain 240m 17s) Loss: 0.0330(0.0624) Grad: 0.5639  
Epoch: [1][24220/51233] Data 0.317 (0.311) Elapsed 215m 18s (remain 240m 7s) Loss: 0.0112(0.0624) Grad: 0.1167  
Epoch: [1][24240/51233] Data 0.309 (0.311) Elapsed 215m 29s (remain 239m 56s) Loss: 0.0032(0.0624) Grad: 0.0528  
Epoch: [1][24260/51233] Data 0.306 (0.311) Elapsed 215m 39s (remain 239m 45s) Loss: 0.1389(0.0624) Grad: 1.5048  
Epoch: [1][24280/51233] Data 0.317 (0.311) Elapsed 215m 50s (remain 239m 35s) Loss: 0.0309(0.0624) Grad: 0.6103  
Epoch: [1][24300/51233] Data 0.308 (0.311) Elapsed 216m 1s (remain 239m 24s) Loss: 0.0359(0.0624) Grad: 0.5597  
Epoch: [1][24320/51233] Data 0.301 (0.311) Elapsed 216m 11s (remain 239m 13s) Loss: 0.0012(0.0624) Grad: 0.0208  
Epoch: [1][24340/51233] Data 0.310 (0.311) Elapsed 216m 22s (remain 239m 3s) Loss: 0.0448(0.0624) Grad: 0.4895  
Epoch: [1][24360/51233] Data 0.310 (0.311) Elapsed 216m 33s (remain 238m 52s) Loss: 0.0538(0.0624) Grad: 0.4583  
Epoch: [1][24380/51233] Data 0.306 (0.311) Elapsed 216m 43s (remain 238m 41s) Loss: 0.0825(0.0623) Grad: 1.3829  
Epoch: [1][24400/51233] Data 0.309 (0.311) Elapsed 216m 54s (remain 238m 31s) Loss: 0.0020(0.0623) Grad: 0.0273  
Epoch: [1][24420/51233] Data 0.308 (0.311) Elapsed 217m 5s (remain 238m 20s) Loss: 0.0046(0.0623) Grad: 0.0936  
Epoch: [1][24440/51233] Data 0.307 (0.311) Elapsed 217m 16s (remain 238m 9s) Loss: 0.0075(0.0624) Grad: 0.1486  
Epoch: [1][24460/51233] Data 0.309 (0.311) Elapsed 217m 26s (remain 237m 59s) Loss: 0.0039(0.0623) Grad: 0.0931  
Epoch: [1][24480/51233] Data 0.317 (0.311) Elapsed 217m 37s (remain 237m 48s) Loss: 0.0188(0.0623) Grad: 0.2517  
Epoch: [1][24500/51233] Data 0.301 (0.311) Elapsed 217m 48s (remain 237m 38s) Loss: 0.0227(0.0623) Grad: 0.8285  
Epoch: [1][24520/51233] Data 0.307 (0.311) Elapsed 217m 58s (remain 237m 27s) Loss: 0.0434(0.0623) Grad: 0.4696  
Epoch: [1][24540/51233] Data 0.316 (0.311) Elapsed 218m 9s (remain 237m 16s) Loss: 0.0789(0.0623) Grad: 1.4150  
Epoch: [1][24560/51233] Data 0.308 (0.311) Elapsed 218m 20s (remain 237m 6s) Loss: 0.0077(0.0623) Grad: 0.1635  
Epoch: [1][24580/51233] Data 0.307 (0.311) Elapsed 218m 30s (remain 236m 55s) Loss: 0.0041(0.0622) Grad: 0.1100  
Epoch: [1][24600/51233] Data 0.317 (0.311) Elapsed 218m 41s (remain 236m 44s) Loss: 0.2043(0.0623) Grad: 1.8629  
Epoch: [1][24620/51233] Data 0.311 (0.311) Elapsed 218m 52s (remain 236m 34s) Loss: 0.0098(0.0622) Grad: 0.1007  
Epoch: [1][24640/51233] Data 0.309 (0.311) Elapsed 219m 2s (remain 236m 23s) Loss: 0.0161(0.0622) Grad: 0.3645  
Epoch: [1][24660/51233] Data 0.305 (0.311) Elapsed 219m 13s (remain 236m 12s) Loss: 0.0011(0.0622) Grad: 0.0244  
Epoch: [1][24680/51233] Data 0.317 (0.311) Elapsed 219m 24s (remain 236m 2s) Loss: 0.0533(0.0622) Grad: 1.3243  
Epoch: [1][24700/51233] Data 0.305 (0.311) Elapsed 219m 34s (remain 235m 51s) Loss: 0.0389(0.0622) Grad: 0.9119  
Epoch: [1][24720/51233] Data 0.308 (0.311) Elapsed 219m 45s (remain 235m 40s) Loss: 0.0084(0.0622) Grad: 0.1000  
Epoch: [1][24740/51233] Data 0.303 (0.311) Elapsed 219m 56s (remain 235m 30s) Loss: 0.0019(0.0622) Grad: 0.0296  
Epoch: [1][24760/51233] Data 0.318 (0.311) Elapsed 220m 6s (remain 235m 19s) Loss: 0.0102(0.0622) Grad: 0.1321  
Epoch: [1][24780/51233] Data 0.310 (0.311) Elapsed 220m 17s (remain 235m 8s) Loss: 0.0426(0.0622) Grad: 0.6071  
Epoch: [1][24800/51233] Data 0.315 (0.311) Elapsed 220m 28s (remain 234m 58s) Loss: 0.0146(0.0622) Grad: 0.2373  
Epoch: [1][24820/51233] Data 0.309 (0.311) Elapsed 220m 39s (remain 234m 47s) Loss: 0.0060(0.0622) Grad: 0.0953  
Epoch: [1][24840/51233] Data 0.306 (0.311) Elapsed 220m 49s (remain 234m 36s) Loss: 0.0785(0.0622) Grad: 1.6383  
Epoch: [1][24860/51233] Data 0.317 (0.311) Elapsed 221m 0s (remain 234m 26s) Loss: 0.0792(0.0621) Grad: 1.1361  
Epoch: [1][24880/51233] Data 0.318 (0.311) Elapsed 221m 11s (remain 234m 15s) Loss: 0.0046(0.0622) Grad: 0.0850  
Epoch: [1][24900/51233] Data 0.299 (0.311) Elapsed 221m 21s (remain 234m 5s) Loss: 0.0178(0.0621) Grad: 0.3564  
Epoch: [1][24920/51233] Data 0.310 (0.311) Elapsed 221m 32s (remain 233m 54s) Loss: 0.0270(0.0621) Grad: 0.4677  
Epoch: [1][24940/51233] Data 0.318 (0.311) Elapsed 221m 43s (remain 233m 43s) Loss: 0.0367(0.0621) Grad: 0.5993  
Epoch: [1][24960/51233] Data 0.316 (0.311) Elapsed 221m 53s (remain 233m 33s) Loss: 0.0998(0.0621) Grad: 2.6048  
Epoch: [1][24980/51233] Data 0.317 (0.311) Elapsed 222m 4s (remain 233m 22s) Loss: 0.1665(0.0621) Grad: 1.5166  
Epoch: [1][25000/51233] Data 0.308 (0.311) Elapsed 222m 15s (remain 233m 11s) Loss: 0.0478(0.0621) Grad: 1.4045  
Epoch: [1][25020/51233] Data 0.317 (0.311) Elapsed 222m 25s (remain 233m 1s) Loss: 0.0050(0.0621) Grad: 0.1010  
Epoch: [1][25040/51233] Data 0.318 (0.311) Elapsed 222m 36s (remain 232m 50s) Loss: 0.0294(0.0621) Grad: 0.6800  
Epoch: [1][25060/51233] Data 0.306 (0.311) Elapsed 222m 47s (remain 232m 39s) Loss: 0.1471(0.0621) Grad: 1.2896  
Epoch: [1][25080/51233] Data 0.308 (0.311) Elapsed 222m 57s (remain 232m 29s) Loss: 0.0026(0.0621) Grad: 0.0527  
Epoch: [1][25100/51233] Data 0.308 (0.311) Elapsed 223m 8s (remain 232m 18s) Loss: 0.2551(0.0621) Grad: 1.5659  
Epoch: [1][25120/51233] Data 0.317 (0.311) Elapsed 223m 19s (remain 232m 7s) Loss: 0.0008(0.0621) Grad: 0.0077  
Epoch: [1][25140/51233] Data 0.306 (0.311) Elapsed 223m 29s (remain 231m 57s) Loss: 0.0662(0.0621) Grad: 1.1005  
Epoch: [1][25160/51233] Data 0.317 (0.311) Elapsed 223m 40s (remain 231m 46s) Loss: 0.0232(0.0621) Grad: 0.3094  
Epoch: [1][25180/51233] Data 0.318 (0.311) Elapsed 223m 51s (remain 231m 35s) Loss: 0.0019(0.0621) Grad: 0.0509  
Epoch: [1][25200/51233] Data 0.318 (0.311) Elapsed 224m 2s (remain 231m 25s) Loss: 0.2339(0.0621) Grad: 2.2006  
Epoch: [1][25220/51233] Data 0.311 (0.311) Elapsed 224m 12s (remain 231m 14s) Loss: 0.0062(0.0621) Grad: 0.1027  
Epoch: [1][25240/51233] Data 0.317 (0.311) Elapsed 224m 23s (remain 231m 4s) Loss: 0.0026(0.0621) Grad: 0.0522  
Epoch: [1][25260/51233] Data 0.307 (0.311) Elapsed 224m 34s (remain 230m 53s) Loss: 0.4227(0.0621) Grad: 2.0298  
Epoch: [1][25280/51233] Data 0.319 (0.311) Elapsed 224m 44s (remain 230m 42s) Loss: 0.1533(0.0621) Grad: 1.5281  
Epoch: [1][25300/51233] Data 0.310 (0.311) Elapsed 224m 55s (remain 230m 32s) Loss: 0.0221(0.0621) Grad: 0.2330  
Epoch: [1][25320/51233] Data 0.311 (0.311) Elapsed 225m 6s (remain 230m 21s) Loss: 0.0928(0.0621) Grad: 1.0518  
Epoch: [1][25340/51233] Data 0.308 (0.311) Elapsed 225m 16s (remain 230m 10s) Loss: 0.0331(0.0621) Grad: 0.5442  
Epoch: [1][25360/51233] Data 0.310 (0.311) Elapsed 225m 27s (remain 230m 0s) Loss: 0.1476(0.0621) Grad: 1.2071  
Epoch: [1][25380/51233] Data 0.315 (0.311) Elapsed 225m 38s (remain 229m 49s) Loss: 0.0201(0.0620) Grad: 0.2952  
Epoch: [1][25400/51233] Data 0.308 (0.311) Elapsed 225m 48s (remain 229m 38s) Loss: 0.0028(0.0620) Grad: 0.0613  
Epoch: [1][25420/51233] Data 0.318 (0.311) Elapsed 225m 59s (remain 229m 28s) Loss: 0.1351(0.0621) Grad: 0.7630  
Epoch: [1][25440/51233] Data 0.312 (0.311) Elapsed 226m 10s (remain 229m 17s) Loss: 0.0098(0.0621) Grad: 0.1837  
Epoch: [1][25460/51233] Data 0.318 (0.311) Elapsed 226m 20s (remain 229m 6s) Loss: 0.0186(0.0621) Grad: 0.2682  
Epoch: [1][25480/51233] Data 0.306 (0.311) Elapsed 226m 31s (remain 228m 56s) Loss: 0.1240(0.0620) Grad: 1.3134  
Epoch: [1][25500/51233] Data 0.317 (0.311) Elapsed 226m 42s (remain 228m 45s) Loss: 0.0009(0.0620) Grad: 0.0088  
Epoch: [1][25520/51233] Data 0.304 (0.311) Elapsed 226m 53s (remain 228m 34s) Loss: 0.1804(0.0620) Grad: 1.7360  
Epoch: [1][25540/51233] Data 0.311 (0.311) Elapsed 227m 3s (remain 228m 24s) Loss: 0.0141(0.0620) Grad: 0.1802  
Epoch: [1][25560/51233] Data 0.311 (0.311) Elapsed 227m 14s (remain 228m 13s) Loss: 0.0097(0.0620) Grad: 0.2268  
Epoch: [1][25580/51233] Data 0.305 (0.311) Elapsed 227m 25s (remain 228m 2s) Loss: 0.0079(0.0620) Grad: 0.1196  
Epoch: [1][25600/51233] Data 0.318 (0.311) Elapsed 227m 35s (remain 227m 52s) Loss: 0.0510(0.0620) Grad: 1.2824  
Epoch: [1][25620/51233] Data 0.313 (0.311) Elapsed 227m 46s (remain 227m 41s) Loss: 0.0007(0.0620) Grad: 0.0153  
Epoch: [1][25640/51233] Data 0.307 (0.311) Elapsed 227m 57s (remain 227m 31s) Loss: 0.1863(0.0620) Grad: 1.3743  
Epoch: [1][25660/51233] Data 0.317 (0.311) Elapsed 228m 7s (remain 227m 20s) Loss: 0.0169(0.0620) Grad: 0.5582  
Epoch: [1][25680/51233] Data 0.308 (0.311) Elapsed 228m 18s (remain 227m 9s) Loss: 0.0342(0.0620) Grad: 0.5032  
Epoch: [1][25700/51233] Data 0.308 (0.311) Elapsed 228m 29s (remain 226m 59s) Loss: 0.0914(0.0620) Grad: 1.3425  
Epoch: [1][25720/51233] Data 0.317 (0.311) Elapsed 228m 39s (remain 226m 48s) Loss: 0.1445(0.0620) Grad: 1.4189  
Epoch: [1][25740/51233] Data 0.318 (0.311) Elapsed 228m 50s (remain 226m 37s) Loss: 0.0504(0.0620) Grad: 0.5321  
Epoch: [1][25760/51233] Data 0.309 (0.311) Elapsed 229m 1s (remain 226m 27s) Loss: 0.0007(0.0620) Grad: 0.0092  
Epoch: [1][25780/51233] Data 0.310 (0.311) Elapsed 229m 11s (remain 226m 16s) Loss: 0.2322(0.0620) Grad: 1.8464  
Epoch: [1][25800/51233] Data 0.317 (0.311) Elapsed 229m 22s (remain 226m 5s) Loss: 0.0787(0.0620) Grad: 1.0921  
Epoch: [1][25820/51233] Data 0.307 (0.311) Elapsed 229m 33s (remain 225m 55s) Loss: 0.4160(0.0620) Grad: 2.4004  
Epoch: [1][25840/51233] Data 0.317 (0.311) Elapsed 229m 43s (remain 225m 44s) Loss: 0.0020(0.0619) Grad: 0.0329  
Epoch: [1][25860/51233] Data 0.317 (0.311) Elapsed 229m 54s (remain 225m 33s) Loss: 0.0029(0.0620) Grad: 0.0604  
Epoch: [1][25880/51233] Data 0.309 (0.311) Elapsed 230m 5s (remain 225m 23s) Loss: 0.0436(0.0620) Grad: 1.0295  
Epoch: [1][25900/51233] Data 0.310 (0.311) Elapsed 230m 16s (remain 225m 12s) Loss: 0.0159(0.0619) Grad: 0.1844  
Epoch: [1][25920/51233] Data 0.318 (0.311) Elapsed 230m 26s (remain 225m 1s) Loss: 0.0146(0.0619) Grad: 0.3816  
Epoch: [1][25940/51233] Data 0.317 (0.311) Elapsed 230m 37s (remain 224m 51s) Loss: 0.2392(0.0619) Grad: 2.0009  
Epoch: [1][25960/51233] Data 0.315 (0.311) Elapsed 230m 48s (remain 224m 40s) Loss: 0.0981(0.0619) Grad: 2.3756  
Epoch: [1][25980/51233] Data 0.308 (0.311) Elapsed 230m 58s (remain 224m 29s) Loss: 0.0145(0.0619) Grad: 0.2269  
Epoch: [1][26000/51233] Data 0.317 (0.311) Elapsed 231m 9s (remain 224m 19s) Loss: 0.1548(0.0619) Grad: 1.8121  
Epoch: [1][26020/51233] Data 0.316 (0.311) Elapsed 231m 20s (remain 224m 8s) Loss: 0.0051(0.0619) Grad: 0.0594  
Epoch: [1][26040/51233] Data 0.308 (0.311) Elapsed 231m 30s (remain 223m 57s) Loss: 0.0211(0.0618) Grad: 0.4088  
Epoch: [1][26060/51233] Data 0.317 (0.311) Elapsed 231m 41s (remain 223m 47s) Loss: 0.0360(0.0618) Grad: 0.6419  
Epoch: [1][26080/51233] Data 0.299 (0.311) Elapsed 231m 52s (remain 223m 36s) Loss: 0.0015(0.0618) Grad: 0.0141  
Epoch: [1][26100/51233] Data 0.299 (0.311) Elapsed 232m 2s (remain 223m 26s) Loss: 0.0089(0.0618) Grad: 0.1133  
Epoch: [1][26120/51233] Data 0.310 (0.311) Elapsed 232m 13s (remain 223m 15s) Loss: 0.2275(0.0618) Grad: 2.2718  
Epoch: [1][26140/51233] Data 0.318 (0.311) Elapsed 232m 24s (remain 223m 4s) Loss: 0.0054(0.0618) Grad: 0.0590  
Epoch: [1][26160/51233] Data 0.315 (0.311) Elapsed 232m 35s (remain 222m 54s) Loss: 0.0252(0.0618) Grad: 0.4111  
Epoch: [1][26180/51233] Data 0.318 (0.311) Elapsed 232m 45s (remain 222m 43s) Loss: 0.0023(0.0618) Grad: 0.0203  
Epoch: [1][26200/51233] Data 0.318 (0.311) Elapsed 232m 56s (remain 222m 32s) Loss: 0.0075(0.0618) Grad: 0.1252  
Epoch: [1][26220/51233] Data 0.312 (0.311) Elapsed 233m 7s (remain 222m 22s) Loss: 0.0478(0.0618) Grad: 0.5161  
Epoch: [1][26240/51233] Data 0.317 (0.311) Elapsed 233m 17s (remain 222m 11s) Loss: 0.1174(0.0618) Grad: 1.5418  
Epoch: [1][26260/51233] Data 0.310 (0.311) Elapsed 233m 28s (remain 222m 0s) Loss: 0.0019(0.0618) Grad: 0.0220  
Epoch: [1][26280/51233] Data 0.317 (0.311) Elapsed 233m 39s (remain 221m 50s) Loss: 0.0020(0.0618) Grad: 0.0318  
Epoch: [1][26300/51233] Data 0.317 (0.311) Elapsed 233m 49s (remain 221m 39s) Loss: 0.0119(0.0618) Grad: 0.3980  
Epoch: [1][26320/51233] Data 0.318 (0.311) Elapsed 234m 0s (remain 221m 28s) Loss: 0.0947(0.0618) Grad: 1.2709  
Epoch: [1][26340/51233] Data 0.317 (0.311) Elapsed 234m 11s (remain 221m 18s) Loss: 0.0963(0.0618) Grad: 1.1199  
Epoch: [1][26360/51233] Data 0.300 (0.311) Elapsed 234m 21s (remain 221m 7s) Loss: 0.0663(0.0618) Grad: 1.1327  
Epoch: [1][26380/51233] Data 0.315 (0.311) Elapsed 234m 32s (remain 220m 56s) Loss: 0.0122(0.0618) Grad: 0.2889  
Epoch: [1][26400/51233] Data 0.318 (0.311) Elapsed 234m 43s (remain 220m 46s) Loss: 0.0747(0.0618) Grad: 0.7377  
Epoch: [1][26420/51233] Data 0.303 (0.311) Elapsed 234m 53s (remain 220m 35s) Loss: 0.0020(0.0618) Grad: 0.0180  
Epoch: [1][26440/51233] Data 0.318 (0.311) Elapsed 235m 4s (remain 220m 24s) Loss: 0.1180(0.0618) Grad: 1.5636  
Epoch: [1][26460/51233] Data 0.305 (0.311) Elapsed 235m 15s (remain 220m 14s) Loss: 0.0464(0.0618) Grad: 0.4715  
Epoch: [1][26480/51233] Data 0.303 (0.311) Elapsed 235m 25s (remain 220m 3s) Loss: 0.0105(0.0618) Grad: 0.1676  
Epoch: [1][26500/51233] Data 0.314 (0.311) Elapsed 235m 36s (remain 219m 52s) Loss: 0.4214(0.0618) Grad: 2.4569  
Epoch: [1][26520/51233] Data 0.308 (0.311) Elapsed 235m 47s (remain 219m 42s) Loss: 0.0318(0.0618) Grad: 0.9088  
Epoch: [1][26540/51233] Data 0.317 (0.311) Elapsed 235m 58s (remain 219m 31s) Loss: 0.0025(0.0618) Grad: 0.0230  
Epoch: [1][26560/51233] Data 0.317 (0.311) Elapsed 236m 8s (remain 219m 21s) Loss: 0.0259(0.0618) Grad: 0.4222  
Epoch: [1][26580/51233] Data 0.318 (0.311) Elapsed 236m 19s (remain 219m 10s) Loss: 0.0063(0.0618) Grad: 0.1377  
Epoch: [1][26600/51233] Data 0.307 (0.311) Elapsed 236m 30s (remain 218m 59s) Loss: 0.0214(0.0618) Grad: 0.5606  
Epoch: [1][26620/51233] Data 0.317 (0.311) Elapsed 236m 40s (remain 218m 49s) Loss: 0.0045(0.0618) Grad: 0.0437  
Epoch: [1][26640/51233] Data 0.317 (0.311) Elapsed 236m 51s (remain 218m 38s) Loss: 0.0042(0.0618) Grad: 0.0876  
Epoch: [1][26660/51233] Data 0.317 (0.311) Elapsed 237m 2s (remain 218m 27s) Loss: 0.0259(0.0618) Grad: 0.3095  
Epoch: [1][26680/51233] Data 0.317 (0.311) Elapsed 237m 12s (remain 218m 17s) Loss: 0.0889(0.0618) Grad: 1.3816  
Epoch: [1][26700/51233] Data 0.310 (0.311) Elapsed 237m 23s (remain 218m 6s) Loss: 0.0372(0.0617) Grad: 0.8978  
Epoch: [1][26720/51233] Data 0.318 (0.311) Elapsed 237m 34s (remain 217m 55s) Loss: 0.0098(0.0617) Grad: 0.1727  
Epoch: [1][26740/51233] Data 0.309 (0.311) Elapsed 237m 44s (remain 217m 45s) Loss: 0.0032(0.0617) Grad: 0.0341  
Epoch: [1][26760/51233] Data 0.319 (0.311) Elapsed 237m 55s (remain 217m 34s) Loss: 0.1339(0.0617) Grad: 1.8093  
Epoch: [1][26780/51233] Data 0.317 (0.311) Elapsed 238m 6s (remain 217m 23s) Loss: 0.0078(0.0617) Grad: 0.1183  
Epoch: [1][26800/51233] Data 0.309 (0.311) Elapsed 238m 16s (remain 217m 13s) Loss: 0.1351(0.0617) Grad: 1.2708  
Epoch: [1][26820/51233] Data 0.310 (0.311) Elapsed 238m 27s (remain 217m 2s) Loss: 0.0375(0.0617) Grad: 0.4689  
Epoch: [1][26840/51233] Data 0.310 (0.311) Elapsed 238m 38s (remain 216m 51s) Loss: 0.0032(0.0617) Grad: 0.0646  
Epoch: [1][26860/51233] Data 0.313 (0.311) Elapsed 238m 48s (remain 216m 41s) Loss: 0.0155(0.0617) Grad: 0.2888  
Epoch: [1][26880/51233] Data 0.314 (0.311) Elapsed 238m 59s (remain 216m 30s) Loss: 0.0039(0.0617) Grad: 0.0687  
Epoch: [1][26900/51233] Data 0.308 (0.311) Elapsed 239m 10s (remain 216m 19s) Loss: 0.0005(0.0617) Grad: 0.0057  
Epoch: [1][26920/51233] Data 0.300 (0.311) Elapsed 239m 21s (remain 216m 9s) Loss: 0.1308(0.0617) Grad: 2.2234  
Epoch: [1][26940/51233] Data 0.313 (0.311) Elapsed 239m 31s (remain 215m 58s) Loss: 0.0370(0.0617) Grad: 0.6858  
Epoch: [1][26960/51233] Data 0.318 (0.311) Elapsed 239m 42s (remain 215m 47s) Loss: 0.0029(0.0617) Grad: 0.0648  
Epoch: [1][26980/51233] Data 0.292 (0.311) Elapsed 239m 53s (remain 215m 37s) Loss: 0.0088(0.0617) Grad: 0.0833  
Epoch: [1][27000/51233] Data 0.317 (0.311) Elapsed 240m 3s (remain 215m 26s) Loss: 0.0362(0.0617) Grad: 0.4212  
Epoch: [1][27020/51233] Data 0.307 (0.311) Elapsed 240m 14s (remain 215m 16s) Loss: 0.0921(0.0617) Grad: 1.2183  
Epoch: [1][27040/51233] Data 0.297 (0.311) Elapsed 240m 25s (remain 215m 5s) Loss: 0.0346(0.0617) Grad: 0.5040  
Epoch: [1][27060/51233] Data 0.308 (0.311) Elapsed 240m 35s (remain 214m 54s) Loss: 0.0064(0.0617) Grad: 0.0945  
Epoch: [1][27080/51233] Data 0.314 (0.311) Elapsed 240m 46s (remain 214m 44s) Loss: 0.0049(0.0617) Grad: 0.0637  
Epoch: [1][27100/51233] Data 0.318 (0.311) Elapsed 240m 57s (remain 214m 33s) Loss: 0.2132(0.0617) Grad: 0.7752  
Epoch: [1][27120/51233] Data 0.313 (0.311) Elapsed 241m 7s (remain 214m 22s) Loss: 0.0930(0.0618) Grad: 1.0722  
Epoch: [1][27140/51233] Data 0.313 (0.311) Elapsed 241m 18s (remain 214m 12s) Loss: 0.0711(0.0617) Grad: 1.0972  
Epoch: [1][27160/51233] Data 0.322 (0.311) Elapsed 241m 29s (remain 214m 1s) Loss: 0.3471(0.0617) Grad: 1.7309  
Epoch: [1][27180/51233] Data 0.314 (0.311) Elapsed 241m 39s (remain 213m 50s) Loss: 0.0308(0.0617) Grad: 0.4604  
Epoch: [1][27200/51233] Data 0.315 (0.311) Elapsed 241m 50s (remain 213m 40s) Loss: 0.0332(0.0617) Grad: 0.6571  
Epoch: [1][27220/51233] Data 0.308 (0.311) Elapsed 242m 1s (remain 213m 29s) Loss: 0.0463(0.0617) Grad: 0.5535  
Epoch: [1][27240/51233] Data 0.318 (0.311) Elapsed 242m 12s (remain 213m 18s) Loss: 0.0012(0.0617) Grad: 0.0150  
Epoch: [1][27260/51233] Data 0.304 (0.311) Elapsed 242m 22s (remain 213m 8s) Loss: 0.0046(0.0617) Grad: 0.1145  
Epoch: [1][27280/51233] Data 0.309 (0.311) Elapsed 242m 33s (remain 212m 57s) Loss: 0.0312(0.0617) Grad: 0.6013  
Epoch: [1][27300/51233] Data 0.315 (0.311) Elapsed 242m 44s (remain 212m 46s) Loss: 0.0095(0.0617) Grad: 0.1122  
Epoch: [1][27320/51233] Data 0.318 (0.311) Elapsed 242m 54s (remain 212m 36s) Loss: 0.0149(0.0617) Grad: 0.2445  
Epoch: [1][27340/51233] Data 0.311 (0.311) Elapsed 243m 5s (remain 212m 25s) Loss: 0.0315(0.0617) Grad: 0.9149  
Epoch: [1][27360/51233] Data 0.317 (0.311) Elapsed 243m 16s (remain 212m 14s) Loss: 0.0146(0.0617) Grad: 0.1303  
Epoch: [1][27380/51233] Data 0.318 (0.311) Elapsed 243m 26s (remain 212m 4s) Loss: 0.1828(0.0617) Grad: 1.1993  
Epoch: [1][27400/51233] Data 0.306 (0.311) Elapsed 243m 37s (remain 211m 53s) Loss: 0.0262(0.0617) Grad: 0.5269  
Epoch: [1][27420/51233] Data 0.317 (0.311) Elapsed 243m 48s (remain 211m 42s) Loss: 0.2202(0.0617) Grad: 2.2783  
Epoch: [1][27440/51233] Data 0.314 (0.311) Elapsed 243m 58s (remain 211m 32s) Loss: 0.0097(0.0617) Grad: 0.1411  
Epoch: [1][27460/51233] Data 0.317 (0.311) Elapsed 244m 9s (remain 211m 21s) Loss: 0.1633(0.0617) Grad: 1.5575  
Epoch: [1][27480/51233] Data 0.314 (0.311) Elapsed 244m 20s (remain 211m 10s) Loss: 0.0246(0.0617) Grad: 0.5768  
Epoch: [1][27500/51233] Data 0.308 (0.311) Elapsed 244m 30s (remain 211m 0s) Loss: 0.0346(0.0616) Grad: 0.4833  
Epoch: [1][27520/51233] Data 0.307 (0.311) Elapsed 244m 41s (remain 210m 49s) Loss: 0.0632(0.0616) Grad: 0.9738  
Epoch: [1][27540/51233] Data 0.311 (0.311) Elapsed 244m 52s (remain 210m 38s) Loss: 0.0093(0.0616) Grad: 0.2068  
Epoch: [1][27560/51233] Data 0.306 (0.311) Elapsed 245m 2s (remain 210m 28s) Loss: 0.0225(0.0616) Grad: 0.2746  
Epoch: [1][27580/51233] Data 0.310 (0.311) Elapsed 245m 13s (remain 210m 17s) Loss: 0.0018(0.0616) Grad: 0.0238  
Epoch: [1][27600/51233] Data 0.317 (0.311) Elapsed 245m 24s (remain 210m 6s) Loss: 0.0113(0.0617) Grad: 0.1412  
Epoch: [1][27620/51233] Data 0.308 (0.311) Elapsed 245m 34s (remain 209m 56s) Loss: 0.0194(0.0617) Grad: 0.2467  
Epoch: [1][27640/51233] Data 0.317 (0.311) Elapsed 245m 45s (remain 209m 45s) Loss: 0.0048(0.0616) Grad: 0.0806  
Epoch: [1][27660/51233] Data 0.316 (0.311) Elapsed 245m 56s (remain 209m 34s) Loss: 0.0472(0.0616) Grad: 0.8133  
Epoch: [1][27680/51233] Data 0.317 (0.311) Elapsed 246m 6s (remain 209m 24s) Loss: 0.0207(0.0616) Grad: 0.2844  
Epoch: [1][27700/51233] Data 0.300 (0.311) Elapsed 246m 17s (remain 209m 13s) Loss: 0.0020(0.0616) Grad: 0.0248  
Epoch: [1][27720/51233] Data 0.317 (0.311) Elapsed 246m 28s (remain 209m 2s) Loss: 0.0265(0.0616) Grad: 0.4111  
Epoch: [1][27740/51233] Data 0.311 (0.311) Elapsed 246m 38s (remain 208m 52s) Loss: 0.0148(0.0616) Grad: 0.2978  
Epoch: [1][27760/51233] Data 0.300 (0.311) Elapsed 246m 49s (remain 208m 41s) Loss: 0.1517(0.0616) Grad: 2.2190  
Epoch: [1][27780/51233] Data 0.309 (0.311) Elapsed 247m 0s (remain 208m 30s) Loss: 0.0819(0.0616) Grad: 0.7295  
Epoch: [1][27800/51233] Data 0.310 (0.311) Elapsed 247m 10s (remain 208m 20s) Loss: 0.0022(0.0616) Grad: 0.0358  
Epoch: [1][27820/51233] Data 0.306 (0.311) Elapsed 247m 21s (remain 208m 9s) Loss: 0.0159(0.0616) Grad: 0.1926  
Epoch: [1][27840/51233] Data 0.305 (0.311) Elapsed 247m 32s (remain 207m 58s) Loss: 0.2600(0.0616) Grad: 1.8308  
Epoch: [1][27860/51233] Data 0.316 (0.311) Elapsed 247m 42s (remain 207m 48s) Loss: 0.0060(0.0616) Grad: 0.1163  
Epoch: [1][27880/51233] Data 0.305 (0.311) Elapsed 247m 53s (remain 207m 37s) Loss: 0.0140(0.0616) Grad: 0.2816  
Epoch: [1][27900/51233] Data 0.318 (0.311) Elapsed 248m 4s (remain 207m 26s) Loss: 0.0726(0.0615) Grad: 0.5185  
Epoch: [1][27920/51233] Data 0.317 (0.311) Elapsed 248m 14s (remain 207m 16s) Loss: 0.0232(0.0615) Grad: 0.3922  
Epoch: [1][27940/51233] Data 0.302 (0.311) Elapsed 248m 25s (remain 207m 5s) Loss: 0.0388(0.0615) Grad: 0.6476  
Epoch: [1][27960/51233] Data 0.307 (0.311) Elapsed 248m 36s (remain 206m 54s) Loss: 0.0059(0.0615) Grad: 0.0936  
Epoch: [1][27980/51233] Data 0.309 (0.311) Elapsed 248m 47s (remain 206m 44s) Loss: 0.0365(0.0615) Grad: 0.4217  
Epoch: [1][28000/51233] Data 0.309 (0.311) Elapsed 248m 57s (remain 206m 33s) Loss: 0.1707(0.0615) Grad: 2.0453  
Epoch: [1][28020/51233] Data 0.318 (0.311) Elapsed 249m 8s (remain 206m 22s) Loss: 0.0115(0.0615) Grad: 0.1414  
Epoch: [1][28040/51233] Data 0.318 (0.311) Elapsed 249m 19s (remain 206m 12s) Loss: 0.0017(0.0615) Grad: 0.0473  
Epoch: [1][28060/51233] Data 0.311 (0.311) Elapsed 249m 29s (remain 206m 1s) Loss: 0.0017(0.0614) Grad: 0.0349  
Epoch: [1][28080/51233] Data 0.309 (0.311) Elapsed 249m 40s (remain 205m 50s) Loss: 0.0321(0.0615) Grad: 0.4074  
Epoch: [1][28100/51233] Data 0.311 (0.311) Elapsed 249m 51s (remain 205m 40s) Loss: 0.0426(0.0615) Grad: 0.5067  
Epoch: [1][28120/51233] Data 0.304 (0.311) Elapsed 250m 1s (remain 205m 29s) Loss: 0.0099(0.0614) Grad: 0.1693  
Epoch: [1][28140/51233] Data 0.314 (0.311) Elapsed 250m 12s (remain 205m 18s) Loss: 0.0133(0.0614) Grad: 0.2708  
Epoch: [1][28160/51233] Data 0.301 (0.311) Elapsed 250m 23s (remain 205m 8s) Loss: 0.0682(0.0614) Grad: 0.9299  
Epoch: [1][28180/51233] Data 0.309 (0.311) Elapsed 250m 33s (remain 204m 57s) Loss: 0.0055(0.0614) Grad: 0.1253  
Epoch: [1][28200/51233] Data 0.318 (0.311) Elapsed 250m 44s (remain 204m 47s) Loss: 0.0115(0.0614) Grad: 0.1231  
Epoch: [1][28220/51233] Data 0.315 (0.311) Elapsed 250m 55s (remain 204m 36s) Loss: 0.0851(0.0614) Grad: 1.6546  
Epoch: [1][28240/51233] Data 0.308 (0.311) Elapsed 251m 5s (remain 204m 25s) Loss: 0.0022(0.0614) Grad: 0.0351  
Epoch: [1][28260/51233] Data 0.316 (0.311) Elapsed 251m 16s (remain 204m 15s) Loss: 0.1350(0.0614) Grad: 1.6674  
Epoch: [1][28280/51233] Data 0.317 (0.311) Elapsed 251m 27s (remain 204m 4s) Loss: 0.0126(0.0614) Grad: 0.1895  
Epoch: [1][28300/51233] Data 0.314 (0.311) Elapsed 251m 37s (remain 203m 53s) Loss: 0.0299(0.0614) Grad: 0.8510  
Epoch: [1][28320/51233] Data 0.309 (0.311) Elapsed 251m 48s (remain 203m 43s) Loss: 0.1106(0.0614) Grad: 1.0303  
Epoch: [1][28340/51233] Data 0.317 (0.311) Elapsed 251m 59s (remain 203m 32s) Loss: 0.0068(0.0614) Grad: 0.0741  
Epoch: [1][28360/51233] Data 0.309 (0.311) Elapsed 252m 10s (remain 203m 21s) Loss: 0.0073(0.0614) Grad: 0.0819  
Epoch: [1][28380/51233] Data 0.304 (0.311) Elapsed 252m 20s (remain 203m 11s) Loss: 0.0033(0.0613) Grad: 0.0412  
Epoch: [1][28400/51233] Data 0.318 (0.311) Elapsed 252m 31s (remain 203m 0s) Loss: 0.0071(0.0613) Grad: 0.1102  
Epoch: [1][28420/51233] Data 0.313 (0.311) Elapsed 252m 42s (remain 202m 49s) Loss: 0.0100(0.0613) Grad: 0.1919  
Epoch: [1][28440/51233] Data 0.317 (0.311) Elapsed 252m 52s (remain 202m 39s) Loss: 0.0629(0.0613) Grad: 1.2264  
Epoch: [1][28460/51233] Data 0.309 (0.311) Elapsed 253m 3s (remain 202m 28s) Loss: 0.0295(0.0613) Grad: 0.6342  
Epoch: [1][28480/51233] Data 0.317 (0.311) Elapsed 253m 14s (remain 202m 17s) Loss: 0.0016(0.0613) Grad: 0.0303  
Epoch: [1][28500/51233] Data 0.309 (0.311) Elapsed 253m 24s (remain 202m 7s) Loss: 0.0042(0.0613) Grad: 0.0670  
Epoch: [1][28520/51233] Data 0.317 (0.311) Elapsed 253m 35s (remain 201m 56s) Loss: 0.0071(0.0613) Grad: 0.0801  
Epoch: [1][28540/51233] Data 0.308 (0.311) Elapsed 253m 46s (remain 201m 45s) Loss: 0.0489(0.0613) Grad: 0.8299  
Epoch: [1][28560/51233] Data 0.309 (0.311) Elapsed 253m 56s (remain 201m 35s) Loss: 0.0093(0.0613) Grad: 0.1737  
Epoch: [1][28580/51233] Data 0.310 (0.311) Elapsed 254m 7s (remain 201m 24s) Loss: 0.0288(0.0613) Grad: 0.3462  
Epoch: [1][28600/51233] Data 0.317 (0.311) Elapsed 254m 18s (remain 201m 13s) Loss: 0.0494(0.0613) Grad: 0.8798  
Epoch: [1][28620/51233] Data 0.305 (0.311) Elapsed 254m 28s (remain 201m 3s) Loss: 0.0032(0.0613) Grad: 0.0815  
Epoch: [1][28640/51233] Data 0.309 (0.311) Elapsed 254m 39s (remain 200m 52s) Loss: 0.0460(0.0613) Grad: 0.9058  
Epoch: [1][28660/51233] Data 0.314 (0.311) Elapsed 254m 50s (remain 200m 41s) Loss: 0.0791(0.0613) Grad: 0.3209  
Epoch: [1][28680/51233] Data 0.305 (0.311) Elapsed 255m 1s (remain 200m 31s) Loss: 0.0654(0.0613) Grad: 1.3268  
Epoch: [1][28700/51233] Data 0.309 (0.311) Elapsed 255m 11s (remain 200m 20s) Loss: 0.0059(0.0613) Grad: 0.0887  
Epoch: [1][28720/51233] Data 0.318 (0.311) Elapsed 255m 22s (remain 200m 9s) Loss: 0.1381(0.0613) Grad: 0.8695  
Epoch: [1][28740/51233] Data 0.314 (0.311) Elapsed 255m 33s (remain 199m 59s) Loss: 0.0022(0.0613) Grad: 0.0342  
Epoch: [1][28760/51233] Data 0.317 (0.311) Elapsed 255m 43s (remain 199m 48s) Loss: 0.3008(0.0613) Grad: 2.2165  
Epoch: [1][28780/51233] Data 0.306 (0.311) Elapsed 255m 54s (remain 199m 37s) Loss: 0.0145(0.0613) Grad: 0.3437  
Epoch: [1][28800/51233] Data 0.299 (0.311) Elapsed 256m 5s (remain 199m 27s) Loss: 0.0207(0.0612) Grad: 0.5396  
Epoch: [1][28820/51233] Data 0.312 (0.311) Elapsed 256m 15s (remain 199m 16s) Loss: 0.0124(0.0612) Grad: 0.1664  
Epoch: [1][28840/51233] Data 0.305 (0.311) Elapsed 256m 26s (remain 199m 6s) Loss: 0.0011(0.0612) Grad: 0.0233  
Epoch: [1][28860/51233] Data 0.317 (0.311) Elapsed 256m 37s (remain 198m 55s) Loss: 0.1781(0.0612) Grad: 1.9374  
Epoch: [1][28880/51233] Data 0.309 (0.311) Elapsed 256m 47s (remain 198m 44s) Loss: 0.0709(0.0612) Grad: 0.9518  
Epoch: [1][28900/51233] Data 0.309 (0.311) Elapsed 256m 58s (remain 198m 34s) Loss: 0.0223(0.0612) Grad: 0.3580  
Epoch: [1][28920/51233] Data 0.307 (0.311) Elapsed 257m 9s (remain 198m 23s) Loss: 0.0011(0.0612) Grad: 0.0231  
Epoch: [1][28940/51233] Data 0.307 (0.311) Elapsed 257m 19s (remain 198m 12s) Loss: 0.0406(0.0612) Grad: 0.7130  
Epoch: [1][28960/51233] Data 0.306 (0.311) Elapsed 257m 30s (remain 198m 2s) Loss: 0.0056(0.0612) Grad: 0.1366  
Epoch: [1][28980/51233] Data 0.297 (0.311) Elapsed 257m 41s (remain 197m 51s) Loss: 0.0773(0.0612) Grad: 1.4059  
Epoch: [1][29000/51233] Data 0.308 (0.311) Elapsed 257m 51s (remain 197m 40s) Loss: 0.0496(0.0612) Grad: 1.8144  
Epoch: [1][29020/51233] Data 0.317 (0.311) Elapsed 258m 2s (remain 197m 30s) Loss: 0.0077(0.0612) Grad: 0.1599  
Epoch: [1][29040/51233] Data 0.313 (0.311) Elapsed 258m 13s (remain 197m 19s) Loss: 0.0051(0.0612) Grad: 0.0735  
Epoch: [1][29060/51233] Data 0.306 (0.311) Elapsed 258m 24s (remain 197m 8s) Loss: 0.4571(0.0612) Grad: 2.3948  
Epoch: [1][29080/51233] Data 0.305 (0.311) Elapsed 258m 34s (remain 196m 58s) Loss: 0.2466(0.0612) Grad: 2.3368  
Epoch: [1][29100/51233] Data 0.318 (0.311) Elapsed 258m 45s (remain 196m 47s) Loss: 0.0061(0.0612) Grad: 0.1623  
Epoch: [1][29120/51233] Data 0.317 (0.311) Elapsed 258m 56s (remain 196m 36s) Loss: 0.0067(0.0612) Grad: 0.0906  
Epoch: [1][29140/51233] Data 0.310 (0.311) Elapsed 259m 6s (remain 196m 26s) Loss: 0.0134(0.0612) Grad: 0.1783  
Epoch: [1][29160/51233] Data 0.311 (0.311) Elapsed 259m 17s (remain 196m 15s) Loss: 0.0119(0.0612) Grad: 0.1975  
Epoch: [1][29180/51233] Data 0.296 (0.311) Elapsed 259m 28s (remain 196m 4s) Loss: 0.0292(0.0611) Grad: 0.6961  
Epoch: [1][29200/51233] Data 0.306 (0.311) Elapsed 259m 38s (remain 195m 54s) Loss: 0.0332(0.0611) Grad: 0.5617  
Epoch: [1][29220/51233] Data 0.309 (0.311) Elapsed 259m 49s (remain 195m 43s) Loss: 0.0566(0.0611) Grad: 0.5144  
Epoch: [1][29240/51233] Data 0.314 (0.311) Elapsed 260m 0s (remain 195m 32s) Loss: 0.0376(0.0611) Grad: 0.7022  
Epoch: [1][29260/51233] Data 0.317 (0.311) Elapsed 260m 10s (remain 195m 22s) Loss: 0.0090(0.0611) Grad: 0.2503  
Epoch: [1][29280/51233] Data 0.310 (0.311) Elapsed 260m 21s (remain 195m 11s) Loss: 0.0120(0.0611) Grad: 0.1201  
Epoch: [1][29300/51233] Data 0.309 (0.311) Elapsed 260m 32s (remain 195m 0s) Loss: 0.0021(0.0611) Grad: 0.0320  
Epoch: [1][29320/51233] Data 0.318 (0.311) Elapsed 260m 42s (remain 194m 50s) Loss: 0.0006(0.0611) Grad: 0.0123  
Epoch: [1][29340/51233] Data 0.318 (0.311) Elapsed 260m 53s (remain 194m 39s) Loss: 0.0543(0.0611) Grad: 0.8089  
Epoch: [1][29360/51233] Data 0.295 (0.311) Elapsed 261m 4s (remain 194m 28s) Loss: 0.0440(0.0611) Grad: 0.9484  
Epoch: [1][29380/51233] Data 0.313 (0.311) Elapsed 261m 15s (remain 194m 18s) Loss: 0.3355(0.0611) Grad: 1.4479  
Epoch: [1][29400/51233] Data 0.313 (0.311) Elapsed 261m 25s (remain 194m 7s) Loss: 0.0209(0.0610) Grad: 0.5030  
Epoch: [1][29420/51233] Data 0.309 (0.311) Elapsed 261m 36s (remain 193m 56s) Loss: 0.0644(0.0610) Grad: 0.5087  
Epoch: [1][29440/51233] Data 0.317 (0.311) Elapsed 261m 47s (remain 193m 46s) Loss: 0.0037(0.0610) Grad: 0.0858  
Epoch: [1][29460/51233] Data 0.310 (0.311) Elapsed 261m 57s (remain 193m 35s) Loss: 0.0102(0.0610) Grad: 0.1361  
Epoch: [1][29480/51233] Data 0.307 (0.311) Elapsed 262m 8s (remain 193m 24s) Loss: 0.0141(0.0610) Grad: 0.3993  
Epoch: [1][29500/51233] Data 0.309 (0.311) Elapsed 262m 19s (remain 193m 14s) Loss: 0.0013(0.0610) Grad: 0.0151  
Epoch: [1][29520/51233] Data 0.308 (0.311) Elapsed 262m 29s (remain 193m 3s) Loss: 0.0084(0.0610) Grad: 0.2823  
Epoch: [1][29540/51233] Data 0.308 (0.311) Elapsed 262m 40s (remain 192m 52s) Loss: 0.0086(0.0610) Grad: 0.1382  
Epoch: [1][29560/51233] Data 0.307 (0.311) Elapsed 262m 51s (remain 192m 42s) Loss: 0.1881(0.0610) Grad: 0.9982  
Epoch: [1][29580/51233] Data 0.290 (0.311) Elapsed 263m 1s (remain 192m 31s) Loss: 0.0425(0.0610) Grad: 0.4184  
Epoch: [1][29600/51233] Data 0.310 (0.311) Elapsed 263m 12s (remain 192m 20s) Loss: 0.0749(0.0611) Grad: 0.9775  
Epoch: [1][29620/51233] Data 0.318 (0.311) Elapsed 263m 23s (remain 192m 10s) Loss: 0.0137(0.0610) Grad: 0.1871  
Epoch: [1][29640/51233] Data 0.317 (0.311) Elapsed 263m 33s (remain 191m 59s) Loss: 0.0058(0.0610) Grad: 0.0854  
Epoch: [1][29660/51233] Data 0.306 (0.311) Elapsed 263m 44s (remain 191m 49s) Loss: 0.0506(0.0610) Grad: 0.5966  
Epoch: [1][29680/51233] Data 0.293 (0.311) Elapsed 263m 55s (remain 191m 38s) Loss: 0.1053(0.0610) Grad: 0.8341  
Epoch: [1][29700/51233] Data 0.309 (0.311) Elapsed 264m 5s (remain 191m 27s) Loss: 0.2513(0.0610) Grad: 1.4324  
Epoch: [1][29720/51233] Data 0.302 (0.311) Elapsed 264m 16s (remain 191m 17s) Loss: 0.0344(0.0610) Grad: 0.4746  
Epoch: [1][29740/51233] Data 0.317 (0.311) Elapsed 264m 27s (remain 191m 6s) Loss: 0.0113(0.0610) Grad: 0.2950  
Epoch: [1][29760/51233] Data 0.311 (0.311) Elapsed 264m 38s (remain 190m 55s) Loss: 0.0053(0.0610) Grad: 0.0922  
Epoch: [1][29780/51233] Data 0.310 (0.311) Elapsed 264m 48s (remain 190m 45s) Loss: 0.1138(0.0610) Grad: 1.7381  
Epoch: [1][29800/51233] Data 0.317 (0.311) Elapsed 264m 59s (remain 190m 34s) Loss: 0.2332(0.0610) Grad: 2.8201  
Epoch: [1][29820/51233] Data 0.318 (0.311) Elapsed 265m 10s (remain 190m 23s) Loss: 0.1409(0.0610) Grad: 1.4642  
Epoch: [1][29840/51233] Data 0.314 (0.311) Elapsed 265m 20s (remain 190m 13s) Loss: 0.0294(0.0610) Grad: 0.4499  
Epoch: [1][29860/51233] Data 0.309 (0.311) Elapsed 265m 31s (remain 190m 2s) Loss: 0.0047(0.0610) Grad: 0.0867  
Epoch: [1][29880/51233] Data 0.310 (0.311) Elapsed 265m 42s (remain 189m 51s) Loss: 0.0055(0.0610) Grad: 0.0703  
Epoch: [1][29900/51233] Data 0.315 (0.311) Elapsed 265m 52s (remain 189m 41s) Loss: 0.0124(0.0610) Grad: 0.1478  
Epoch: [1][29920/51233] Data 0.309 (0.311) Elapsed 266m 3s (remain 189m 30s) Loss: 0.0605(0.0610) Grad: 0.9392  
Epoch: [1][29940/51233] Data 0.308 (0.311) Elapsed 266m 14s (remain 189m 19s) Loss: 0.0816(0.0610) Grad: 1.1842  
Epoch: [1][29960/51233] Data 0.297 (0.311) Elapsed 266m 24s (remain 189m 9s) Loss: 0.1365(0.0609) Grad: 1.2885  
Epoch: [1][29980/51233] Data 0.315 (0.311) Elapsed 266m 35s (remain 188m 58s) Loss: 0.0156(0.0609) Grad: 0.1846  
Epoch: [1][30000/51233] Data 0.317 (0.311) Elapsed 266m 46s (remain 188m 47s) Loss: 0.0172(0.0609) Grad: 0.3254  
Epoch: [1][30020/51233] Data 0.297 (0.311) Elapsed 266m 56s (remain 188m 37s) Loss: 0.0394(0.0609) Grad: 0.6200  
Epoch: [1][30040/51233] Data 0.317 (0.311) Elapsed 267m 7s (remain 188m 26s) Loss: 0.0032(0.0609) Grad: 0.0431  
Epoch: [1][30060/51233] Data 0.304 (0.311) Elapsed 267m 18s (remain 188m 15s) Loss: 0.1344(0.0609) Grad: 1.4760  
Epoch: [1][30080/51233] Data 0.313 (0.311) Elapsed 267m 29s (remain 188m 5s) Loss: 0.0193(0.0609) Grad: 0.3722  
Epoch: [1][30100/51233] Data 0.307 (0.311) Elapsed 267m 39s (remain 187m 54s) Loss: 0.0030(0.0609) Grad: 0.0504  
Epoch: [1][30120/51233] Data 0.317 (0.311) Elapsed 267m 50s (remain 187m 43s) Loss: 0.0019(0.0609) Grad: 0.0253  
Epoch: [1][30140/51233] Data 0.311 (0.311) Elapsed 268m 1s (remain 187m 33s) Loss: 0.0087(0.0609) Grad: 0.0995  
Epoch: [1][30160/51233] Data 0.307 (0.311) Elapsed 268m 11s (remain 187m 22s) Loss: 0.1842(0.0609) Grad: 2.5051  
Epoch: [1][30180/51233] Data 0.311 (0.311) Elapsed 268m 22s (remain 187m 11s) Loss: 0.0075(0.0609) Grad: 0.2294  
Epoch: [1][30200/51233] Data 0.315 (0.311) Elapsed 268m 33s (remain 187m 1s) Loss: 0.0060(0.0609) Grad: 0.0870  
Epoch: [1][30220/51233] Data 0.317 (0.311) Elapsed 268m 43s (remain 186m 50s) Loss: 0.0600(0.0609) Grad: 0.8316  
Epoch: [1][30240/51233] Data 0.309 (0.311) Elapsed 268m 54s (remain 186m 39s) Loss: 0.0875(0.0609) Grad: 1.2740  
Epoch: [1][30260/51233] Data 0.314 (0.311) Elapsed 269m 5s (remain 186m 29s) Loss: 0.0116(0.0609) Grad: 0.2061  
Epoch: [1][30280/51233] Data 0.318 (0.311) Elapsed 269m 15s (remain 186m 18s) Loss: 0.0864(0.0609) Grad: 1.1079  
Epoch: [1][30300/51233] Data 0.318 (0.311) Elapsed 269m 26s (remain 186m 7s) Loss: 0.0027(0.0609) Grad: 0.0342  
Epoch: [1][30320/51233] Data 0.294 (0.311) Elapsed 269m 37s (remain 185m 57s) Loss: 0.0073(0.0609) Grad: 0.1204  
Epoch: [1][30340/51233] Data 0.318 (0.311) Elapsed 269m 47s (remain 185m 46s) Loss: 0.0614(0.0609) Grad: 0.9234  
Epoch: [1][30360/51233] Data 0.317 (0.311) Elapsed 269m 58s (remain 185m 35s) Loss: 0.0515(0.0609) Grad: 0.9556  
Epoch: [1][30380/51233] Data 0.318 (0.311) Elapsed 270m 9s (remain 185m 25s) Loss: 0.0198(0.0609) Grad: 0.5163  
Epoch: [1][30400/51233] Data 0.316 (0.311) Elapsed 270m 19s (remain 185m 14s) Loss: 0.0078(0.0609) Grad: 0.1720  
Epoch: [1][30420/51233] Data 0.317 (0.311) Elapsed 270m 30s (remain 185m 3s) Loss: 0.0023(0.0609) Grad: 0.0281  
Epoch: [1][30440/51233] Data 0.308 (0.311) Elapsed 270m 41s (remain 184m 53s) Loss: 0.0017(0.0609) Grad: 0.0186  
Epoch: [1][30460/51233] Data 0.303 (0.311) Elapsed 270m 52s (remain 184m 42s) Loss: 0.0113(0.0608) Grad: 0.2276  
Epoch: [1][30480/51233] Data 0.306 (0.311) Elapsed 271m 2s (remain 184m 31s) Loss: 0.0938(0.0608) Grad: 1.0808  
Epoch: [1][30500/51233] Data 0.311 (0.311) Elapsed 271m 13s (remain 184m 21s) Loss: 0.0173(0.0608) Grad: 0.2391  
Epoch: [1][30520/51233] Data 0.308 (0.311) Elapsed 271m 24s (remain 184m 10s) Loss: 0.1360(0.0608) Grad: 1.1248  
Epoch: [1][30540/51233] Data 0.318 (0.311) Elapsed 271m 34s (remain 183m 59s) Loss: 0.0141(0.0609) Grad: 0.3435  
Epoch: [1][30560/51233] Data 0.311 (0.311) Elapsed 271m 45s (remain 183m 49s) Loss: 0.0419(0.0608) Grad: 0.6285  
Epoch: [1][30580/51233] Data 0.311 (0.311) Elapsed 271m 56s (remain 183m 38s) Loss: 0.2714(0.0608) Grad: 2.5573  
Epoch: [1][30600/51233] Data 0.303 (0.311) Elapsed 272m 6s (remain 183m 27s) Loss: 0.0008(0.0608) Grad: 0.0127  
Epoch: [1][30620/51233] Data 0.309 (0.311) Elapsed 272m 17s (remain 183m 17s) Loss: 0.0158(0.0608) Grad: 0.2651  
Epoch: [1][30640/51233] Data 0.317 (0.311) Elapsed 272m 28s (remain 183m 6s) Loss: 0.0032(0.0608) Grad: 0.0344  
Epoch: [1][30660/51233] Data 0.318 (0.311) Elapsed 272m 38s (remain 182m 55s) Loss: 0.0008(0.0608) Grad: 0.0141  
Epoch: [1][30680/51233] Data 0.318 (0.311) Elapsed 272m 49s (remain 182m 45s) Loss: 0.0090(0.0608) Grad: 0.1864  
Epoch: [1][30700/51233] Data 0.307 (0.311) Elapsed 273m 0s (remain 182m 34s) Loss: 0.0292(0.0608) Grad: 0.5772  
Epoch: [1][30720/51233] Data 0.309 (0.311) Elapsed 273m 10s (remain 182m 24s) Loss: 0.1961(0.0608) Grad: 2.2935  
Epoch: [1][30740/51233] Data 0.315 (0.311) Elapsed 273m 21s (remain 182m 13s) Loss: 0.0065(0.0608) Grad: 0.1022  
Epoch: [1][30760/51233] Data 0.305 (0.311) Elapsed 273m 32s (remain 182m 2s) Loss: 0.0099(0.0608) Grad: 0.2288  
Epoch: [1][30780/51233] Data 0.310 (0.311) Elapsed 273m 42s (remain 181m 52s) Loss: 0.0763(0.0608) Grad: 1.1762  
Epoch: [1][30800/51233] Data 0.301 (0.311) Elapsed 273m 53s (remain 181m 41s) Loss: 0.0071(0.0608) Grad: 0.0940  
Epoch: [1][30820/51233] Data 0.318 (0.311) Elapsed 274m 4s (remain 181m 30s) Loss: 0.3194(0.0608) Grad: 1.7666  
Epoch: [1][30840/51233] Data 0.309 (0.311) Elapsed 274m 15s (remain 181m 20s) Loss: 0.0444(0.0608) Grad: 0.5492  
Epoch: [1][30860/51233] Data 0.316 (0.311) Elapsed 274m 25s (remain 181m 9s) Loss: 0.0516(0.0608) Grad: 0.5884  
Epoch: [1][30880/51233] Data 0.310 (0.311) Elapsed 274m 36s (remain 180m 58s) Loss: 0.0053(0.0607) Grad: 0.1481  
Epoch: [1][30900/51233] Data 0.317 (0.311) Elapsed 274m 47s (remain 180m 48s) Loss: 0.2297(0.0607) Grad: 1.9019  
Epoch: [1][30920/51233] Data 0.314 (0.311) Elapsed 274m 57s (remain 180m 37s) Loss: 0.0009(0.0607) Grad: 0.0121  
Epoch: [1][30940/51233] Data 0.317 (0.311) Elapsed 275m 8s (remain 180m 26s) Loss: 0.0218(0.0607) Grad: 0.2392  
Epoch: [1][30960/51233] Data 0.294 (0.311) Elapsed 275m 19s (remain 180m 16s) Loss: 0.4423(0.0607) Grad: 2.2782  
Epoch: [1][30980/51233] Data 0.310 (0.311) Elapsed 275m 29s (remain 180m 5s) Loss: 0.0057(0.0607) Grad: 0.0589  
Epoch: [1][31000/51233] Data 0.318 (0.311) Elapsed 275m 40s (remain 179m 54s) Loss: 0.0370(0.0607) Grad: 0.5204  
Epoch: [1][31020/51233] Data 0.306 (0.311) Elapsed 275m 51s (remain 179m 44s) Loss: 0.0094(0.0607) Grad: 0.1069  
Epoch: [1][31040/51233] Data 0.312 (0.311) Elapsed 276m 1s (remain 179m 33s) Loss: 0.0063(0.0607) Grad: 0.0649  
Epoch: [1][31060/51233] Data 0.318 (0.311) Elapsed 276m 12s (remain 179m 22s) Loss: 0.0164(0.0607) Grad: 0.3118  
Epoch: [1][31080/51233] Data 0.309 (0.311) Elapsed 276m 23s (remain 179m 12s) Loss: 0.0107(0.0607) Grad: 0.1532  
Epoch: [1][31100/51233] Data 0.309 (0.311) Elapsed 276m 33s (remain 179m 1s) Loss: 0.0161(0.0607) Grad: 0.4354  
Epoch: [1][31120/51233] Data 0.317 (0.311) Elapsed 276m 44s (remain 178m 50s) Loss: 0.0026(0.0607) Grad: 0.0296  
Epoch: [1][31140/51233] Data 0.312 (0.311) Elapsed 276m 55s (remain 178m 40s) Loss: 0.0093(0.0607) Grad: 0.2789  
Epoch: [1][31160/51233] Data 0.309 (0.311) Elapsed 277m 6s (remain 178m 29s) Loss: 0.0006(0.0607) Grad: 0.0065  
Epoch: [1][31180/51233] Data 0.316 (0.311) Elapsed 277m 16s (remain 178m 18s) Loss: 0.0284(0.0607) Grad: 0.4862  
Epoch: [1][31200/51233] Data 0.307 (0.311) Elapsed 277m 27s (remain 178m 8s) Loss: 0.2250(0.0607) Grad: 1.0755  
Epoch: [1][31220/51233] Data 0.317 (0.311) Elapsed 277m 38s (remain 177m 57s) Loss: 0.0225(0.0607) Grad: 0.3668  
Epoch: [1][31240/51233] Data 0.308 (0.311) Elapsed 277m 48s (remain 177m 46s) Loss: 0.0101(0.0607) Grad: 0.1699  
Epoch: [1][31260/51233] Data 0.311 (0.311) Elapsed 277m 59s (remain 177m 36s) Loss: 0.0027(0.0607) Grad: 0.0517  
Epoch: [1][31280/51233] Data 0.317 (0.311) Elapsed 278m 10s (remain 177m 25s) Loss: 0.0269(0.0607) Grad: 0.4764  
Epoch: [1][31300/51233] Data 0.307 (0.311) Elapsed 278m 20s (remain 177m 14s) Loss: 0.0331(0.0607) Grad: 0.6944  
Epoch: [1][31320/51233] Data 0.308 (0.311) Elapsed 278m 31s (remain 177m 4s) Loss: 0.2072(0.0607) Grad: 1.6750  
Epoch: [1][31340/51233] Data 0.306 (0.311) Elapsed 278m 42s (remain 176m 53s) Loss: 0.0639(0.0606) Grad: 1.5566  
Epoch: [1][31360/51233] Data 0.317 (0.311) Elapsed 278m 52s (remain 176m 42s) Loss: 0.0730(0.0606) Grad: 1.5827  
Epoch: [1][31380/51233] Data 0.306 (0.311) Elapsed 279m 3s (remain 176m 32s) Loss: 0.0153(0.0606) Grad: 0.2970  
Epoch: [1][31400/51233] Data 0.312 (0.311) Elapsed 279m 14s (remain 176m 21s) Loss: 0.1656(0.0606) Grad: 2.4365  
Epoch: [1][31420/51233] Data 0.318 (0.311) Elapsed 279m 24s (remain 176m 10s) Loss: 0.0971(0.0606) Grad: 1.3498  
Epoch: [1][31440/51233] Data 0.318 (0.311) Elapsed 279m 35s (remain 176m 0s) Loss: 0.0053(0.0606) Grad: 0.0995  
Epoch: [1][31460/51233] Data 0.312 (0.311) Elapsed 279m 46s (remain 175m 49s) Loss: 0.0207(0.0606) Grad: 0.3323  
Epoch: [1][31480/51233] Data 0.318 (0.311) Elapsed 279m 57s (remain 175m 38s) Loss: 0.0120(0.0606) Grad: 0.2232  
Epoch: [1][31500/51233] Data 0.318 (0.311) Elapsed 280m 7s (remain 175m 28s) Loss: 0.0071(0.0606) Grad: 0.0929  
Epoch: [1][31520/51233] Data 0.318 (0.311) Elapsed 280m 18s (remain 175m 17s) Loss: 0.0057(0.0606) Grad: 0.0995  
Epoch: [1][31540/51233] Data 0.317 (0.311) Elapsed 280m 29s (remain 175m 6s) Loss: 0.0192(0.0606) Grad: 0.4377  
Epoch: [1][31560/51233] Data 0.317 (0.311) Elapsed 280m 39s (remain 174m 56s) Loss: 0.0007(0.0605) Grad: 0.0105  
Epoch: [1][31580/51233] Data 0.318 (0.311) Elapsed 280m 50s (remain 174m 45s) Loss: 0.0674(0.0605) Grad: 1.5799  
Epoch: [1][31600/51233] Data 0.316 (0.311) Elapsed 281m 1s (remain 174m 34s) Loss: 0.0139(0.0605) Grad: 0.3630  
Epoch: [1][31620/51233] Data 0.298 (0.311) Elapsed 281m 11s (remain 174m 24s) Loss: 0.0977(0.0605) Grad: 1.6509  
Epoch: [1][31640/51233] Data 0.307 (0.311) Elapsed 281m 22s (remain 174m 13s) Loss: 0.0092(0.0605) Grad: 0.1080  
Epoch: [1][31660/51233] Data 0.303 (0.311) Elapsed 281m 33s (remain 174m 2s) Loss: 0.0104(0.0606) Grad: 0.2421  
Epoch: [1][31680/51233] Data 0.313 (0.311) Elapsed 281m 43s (remain 173m 52s) Loss: 0.0017(0.0606) Grad: 0.0244  
Epoch: [1][31700/51233] Data 0.307 (0.311) Elapsed 281m 54s (remain 173m 41s) Loss: 0.0625(0.0606) Grad: 1.5809  
Epoch: [1][31720/51233] Data 0.317 (0.311) Elapsed 282m 5s (remain 173m 30s) Loss: 0.2075(0.0606) Grad: 2.3750  
Epoch: [1][31740/51233] Data 0.317 (0.311) Elapsed 282m 15s (remain 173m 20s) Loss: 0.0123(0.0606) Grad: 0.1973  
Epoch: [1][31760/51233] Data 0.317 (0.311) Elapsed 282m 26s (remain 173m 9s) Loss: 0.2812(0.0605) Grad: 1.5006  
Epoch: [1][31780/51233] Data 0.317 (0.311) Elapsed 282m 37s (remain 172m 58s) Loss: 0.0089(0.0606) Grad: 0.1275  
Epoch: [1][31800/51233] Data 0.318 (0.311) Elapsed 282m 48s (remain 172m 48s) Loss: 0.1114(0.0606) Grad: 0.9145  
Epoch: [1][31820/51233] Data 0.309 (0.311) Elapsed 282m 58s (remain 172m 37s) Loss: 0.0213(0.0606) Grad: 0.3946  
Epoch: [1][31840/51233] Data 0.317 (0.311) Elapsed 283m 9s (remain 172m 26s) Loss: 0.0141(0.0606) Grad: 0.2286  
Epoch: [1][31860/51233] Data 0.307 (0.311) Elapsed 283m 20s (remain 172m 16s) Loss: 0.0061(0.0606) Grad: 0.1236  
Epoch: [1][31880/51233] Data 0.309 (0.311) Elapsed 283m 30s (remain 172m 5s) Loss: 0.0148(0.0605) Grad: 0.4168  
Epoch: [1][31900/51233] Data 0.306 (0.311) Elapsed 283m 41s (remain 171m 55s) Loss: 0.0032(0.0606) Grad: 0.0427  
Epoch: [1][31920/51233] Data 0.308 (0.311) Elapsed 283m 52s (remain 171m 44s) Loss: 0.0464(0.0606) Grad: 0.9012  
Epoch: [1][31940/51233] Data 0.317 (0.311) Elapsed 284m 2s (remain 171m 33s) Loss: 0.0128(0.0606) Grad: 0.1712  
Epoch: [1][31960/51233] Data 0.300 (0.311) Elapsed 284m 13s (remain 171m 23s) Loss: 0.0677(0.0606) Grad: 1.0694  
Epoch: [1][31980/51233] Data 0.307 (0.311) Elapsed 284m 24s (remain 171m 12s) Loss: 0.0128(0.0606) Grad: 0.1352  
Epoch: [1][32000/51233] Data 0.317 (0.311) Elapsed 284m 34s (remain 171m 1s) Loss: 0.0563(0.0605) Grad: 0.3909  
Epoch: [1][32020/51233] Data 0.300 (0.311) Elapsed 284m 45s (remain 170m 51s) Loss: 0.0427(0.0605) Grad: 1.3903  
Epoch: [1][32040/51233] Data 0.303 (0.311) Elapsed 284m 56s (remain 170m 40s) Loss: 0.0120(0.0605) Grad: 0.2349  
Epoch: [1][32060/51233] Data 0.314 (0.311) Elapsed 285m 6s (remain 170m 29s) Loss: 0.3751(0.0605) Grad: 2.3866  
Epoch: [1][32080/51233] Data 0.304 (0.311) Elapsed 285m 17s (remain 170m 19s) Loss: 0.1214(0.0605) Grad: 1.3023  
Epoch: [1][32100/51233] Data 0.307 (0.311) Elapsed 285m 28s (remain 170m 8s) Loss: 0.0159(0.0605) Grad: 0.4087  
Epoch: [1][32120/51233] Data 0.318 (0.311) Elapsed 285m 39s (remain 169m 57s) Loss: 0.2771(0.0605) Grad: 3.1357  
Epoch: [1][32140/51233] Data 0.311 (0.311) Elapsed 285m 49s (remain 169m 47s) Loss: 0.0062(0.0605) Grad: 0.1687  
Epoch: [1][32160/51233] Data 0.309 (0.311) Elapsed 286m 0s (remain 169m 36s) Loss: 0.0178(0.0605) Grad: 0.2404  
Epoch: [1][32180/51233] Data 0.318 (0.311) Elapsed 286m 11s (remain 169m 25s) Loss: 0.0298(0.0604) Grad: 0.5155  
Epoch: [1][32200/51233] Data 0.314 (0.311) Elapsed 286m 21s (remain 169m 15s) Loss: 0.0078(0.0604) Grad: 0.2001  
Epoch: [1][32220/51233] Data 0.308 (0.311) Elapsed 286m 32s (remain 169m 4s) Loss: 0.0317(0.0604) Grad: 0.7287  
Epoch: [1][32240/51233] Data 0.300 (0.311) Elapsed 286m 43s (remain 168m 53s) Loss: 0.0226(0.0604) Grad: 0.2832  
Epoch: [1][32260/51233] Data 0.318 (0.311) Elapsed 286m 53s (remain 168m 43s) Loss: 0.0042(0.0604) Grad: 0.0578  
Epoch: [1][32280/51233] Data 0.304 (0.311) Elapsed 287m 4s (remain 168m 32s) Loss: 0.0186(0.0604) Grad: 0.3040  
Epoch: [1][32300/51233] Data 0.315 (0.311) Elapsed 287m 15s (remain 168m 21s) Loss: 0.0699(0.0604) Grad: 1.2521  
Epoch: [1][32320/51233] Data 0.302 (0.311) Elapsed 287m 25s (remain 168m 11s) Loss: 0.0059(0.0604) Grad: 0.1304  
Epoch: [1][32340/51233] Data 0.316 (0.311) Elapsed 287m 36s (remain 168m 0s) Loss: 0.0734(0.0604) Grad: 1.3127  
Epoch: [1][32360/51233] Data 0.307 (0.311) Elapsed 287m 47s (remain 167m 49s) Loss: 0.0111(0.0604) Grad: 0.2924  
Epoch: [1][32380/51233] Data 0.310 (0.311) Elapsed 287m 57s (remain 167m 39s) Loss: 0.0649(0.0603) Grad: 0.9822  
Epoch: [1][32400/51233] Data 0.304 (0.311) Elapsed 288m 8s (remain 167m 28s) Loss: 0.0035(0.0604) Grad: 0.0567  
Epoch: [1][32420/51233] Data 0.305 (0.311) Elapsed 288m 19s (remain 167m 17s) Loss: 0.0011(0.0604) Grad: 0.0140  
Epoch: [1][32440/51233] Data 0.308 (0.311) Elapsed 288m 29s (remain 167m 6s) Loss: 0.0049(0.0603) Grad: 0.0752  
Epoch: [1][32460/51233] Data 0.316 (0.311) Elapsed 288m 40s (remain 166m 56s) Loss: 0.0023(0.0603) Grad: 0.0611  
Epoch: [1][32480/51233] Data 0.313 (0.311) Elapsed 288m 51s (remain 166m 45s) Loss: 0.0047(0.0603) Grad: 0.0606  
Epoch: [1][32500/51233] Data 0.306 (0.311) Elapsed 289m 1s (remain 166m 34s) Loss: 0.0281(0.0603) Grad: 0.4626  
Epoch: [1][32520/51233] Data 0.317 (0.311) Elapsed 289m 12s (remain 166m 24s) Loss: 0.0209(0.0603) Grad: 0.4034  
Epoch: [1][32540/51233] Data 0.318 (0.311) Elapsed 289m 23s (remain 166m 13s) Loss: 0.0454(0.0603) Grad: 1.0868  
Epoch: [1][32560/51233] Data 0.317 (0.311) Elapsed 289m 33s (remain 166m 2s) Loss: 0.0432(0.0603) Grad: 0.7317  
Epoch: [1][32580/51233] Data 0.316 (0.311) Elapsed 289m 44s (remain 165m 52s) Loss: 0.0153(0.0603) Grad: 0.2846  
Epoch: [1][32600/51233] Data 0.316 (0.311) Elapsed 289m 55s (remain 165m 41s) Loss: 0.0616(0.0603) Grad: 1.0863  
Epoch: [1][32620/51233] Data 0.310 (0.311) Elapsed 290m 5s (remain 165m 30s) Loss: 0.0275(0.0603) Grad: 0.4279  
Epoch: [1][32640/51233] Data 0.310 (0.311) Elapsed 290m 16s (remain 165m 20s) Loss: 0.2279(0.0603) Grad: 2.2729  
Epoch: [1][32660/51233] Data 0.308 (0.311) Elapsed 290m 27s (remain 165m 9s) Loss: 0.1341(0.0603) Grad: 1.8917  
Epoch: [1][32680/51233] Data 0.317 (0.311) Elapsed 290m 37s (remain 164m 58s) Loss: 0.0045(0.0603) Grad: 0.0509  
Epoch: [1][32700/51233] Data 0.318 (0.311) Elapsed 290m 48s (remain 164m 48s) Loss: 0.0080(0.0603) Grad: 0.1952  
Epoch: [1][32720/51233] Data 0.308 (0.311) Elapsed 290m 59s (remain 164m 37s) Loss: 0.0143(0.0602) Grad: 0.2353  
Epoch: [1][32740/51233] Data 0.309 (0.311) Elapsed 291m 9s (remain 164m 26s) Loss: 0.1239(0.0602) Grad: 0.7460  
Epoch: [1][32760/51233] Data 0.305 (0.311) Elapsed 291m 20s (remain 164m 16s) Loss: 0.0119(0.0602) Grad: 0.2020  
Epoch: [1][32780/51233] Data 0.316 (0.311) Elapsed 291m 31s (remain 164m 5s) Loss: 0.0179(0.0602) Grad: 0.4280  
Epoch: [1][32800/51233] Data 0.293 (0.311) Elapsed 291m 41s (remain 163m 54s) Loss: 0.0080(0.0602) Grad: 0.1143  
Epoch: [1][32820/51233] Data 0.307 (0.311) Elapsed 291m 52s (remain 163m 44s) Loss: 0.0454(0.0602) Grad: 1.4335  
Epoch: [1][32840/51233] Data 0.318 (0.311) Elapsed 292m 3s (remain 163m 33s) Loss: 0.0017(0.0602) Grad: 0.0178  
Epoch: [1][32860/51233] Data 0.305 (0.311) Elapsed 292m 13s (remain 163m 22s) Loss: 0.0054(0.0602) Grad: 0.0852  
Epoch: [1][32880/51233] Data 0.308 (0.311) Elapsed 292m 24s (remain 163m 12s) Loss: 0.0071(0.0602) Grad: 0.1125  
Epoch: [1][32900/51233] Data 0.298 (0.311) Elapsed 292m 35s (remain 163m 1s) Loss: 0.0065(0.0602) Grad: 0.1418  
Epoch: [1][32920/51233] Data 0.305 (0.311) Elapsed 292m 46s (remain 162m 50s) Loss: 0.2687(0.0602) Grad: 1.4945  
Epoch: [1][32940/51233] Data 0.316 (0.311) Elapsed 292m 56s (remain 162m 40s) Loss: 0.0854(0.0602) Grad: 0.8430  
Epoch: [1][32960/51233] Data 0.305 (0.311) Elapsed 293m 7s (remain 162m 29s) Loss: 0.0226(0.0602) Grad: 0.2749  
Epoch: [1][32980/51233] Data 0.307 (0.311) Elapsed 293m 18s (remain 162m 18s) Loss: 0.1719(0.0602) Grad: 1.4215  
Epoch: [1][33000/51233] Data 0.307 (0.311) Elapsed 293m 28s (remain 162m 8s) Loss: 0.0022(0.0602) Grad: 0.0318  
Epoch: [1][33020/51233] Data 0.297 (0.311) Elapsed 293m 39s (remain 161m 57s) Loss: 0.0018(0.0602) Grad: 0.0207  
Epoch: [1][33040/51233] Data 0.307 (0.311) Elapsed 293m 50s (remain 161m 46s) Loss: 0.0030(0.0602) Grad: 0.0546  
Epoch: [1][33060/51233] Data 0.317 (0.311) Elapsed 294m 0s (remain 161m 36s) Loss: 0.0063(0.0602) Grad: 0.1623  
Epoch: [1][33080/51233] Data 0.307 (0.311) Elapsed 294m 11s (remain 161m 25s) Loss: 0.0014(0.0602) Grad: 0.0117  
Epoch: [1][33100/51233] Data 0.295 (0.311) Elapsed 294m 22s (remain 161m 14s) Loss: 0.0042(0.0602) Grad: 0.0693  
Epoch: [1][33120/51233] Data 0.308 (0.311) Elapsed 294m 32s (remain 161m 4s) Loss: 0.0271(0.0602) Grad: 0.7434  
Epoch: [1][33140/51233] Data 0.309 (0.311) Elapsed 294m 43s (remain 160m 53s) Loss: 0.1209(0.0602) Grad: 1.7439  
Epoch: [1][33160/51233] Data 0.298 (0.311) Elapsed 294m 54s (remain 160m 42s) Loss: 0.0198(0.0602) Grad: 0.3189  
Epoch: [1][33180/51233] Data 0.314 (0.311) Elapsed 295m 4s (remain 160m 32s) Loss: 0.0050(0.0601) Grad: 0.1264  
Epoch: [1][33200/51233] Data 0.307 (0.311) Elapsed 295m 15s (remain 160m 21s) Loss: 0.0349(0.0601) Grad: 0.6437  
Epoch: [1][33220/51233] Data 0.308 (0.311) Elapsed 295m 26s (remain 160m 10s) Loss: 0.3658(0.0601) Grad: 5.1530  
Epoch: [1][33240/51233] Data 0.314 (0.311) Elapsed 295m 37s (remain 160m 0s) Loss: 0.0813(0.0602) Grad: 1.0748  
Epoch: [1][33260/51233] Data 0.309 (0.311) Elapsed 295m 47s (remain 159m 49s) Loss: 0.0099(0.0602) Grad: 0.1434  
Epoch: [1][33280/51233] Data 0.306 (0.311) Elapsed 295m 58s (remain 159m 39s) Loss: 0.0251(0.0601) Grad: 0.3538  
Epoch: [1][33300/51233] Data 0.318 (0.311) Elapsed 296m 9s (remain 159m 28s) Loss: 0.0574(0.0602) Grad: 0.8981  
Epoch: [1][33320/51233] Data 0.312 (0.311) Elapsed 296m 19s (remain 159m 17s) Loss: 0.0859(0.0601) Grad: 0.8973  
Epoch: [1][33340/51233] Data 0.316 (0.311) Elapsed 296m 30s (remain 159m 7s) Loss: 0.0015(0.0601) Grad: 0.0260  
Epoch: [1][33360/51233] Data 0.314 (0.311) Elapsed 296m 41s (remain 158m 56s) Loss: 0.0062(0.0601) Grad: 0.1115  
Epoch: [1][33380/51233] Data 0.282 (0.311) Elapsed 296m 51s (remain 158m 45s) Loss: 0.0343(0.0601) Grad: 0.3063  
Epoch: [1][33400/51233] Data 0.309 (0.311) Elapsed 297m 2s (remain 158m 35s) Loss: 0.0008(0.0601) Grad: 0.0076  
Epoch: [1][33420/51233] Data 0.318 (0.311) Elapsed 297m 13s (remain 158m 24s) Loss: 0.0238(0.0601) Grad: 0.5368  
Epoch: [1][33440/51233] Data 0.298 (0.311) Elapsed 297m 23s (remain 158m 13s) Loss: 0.0067(0.0601) Grad: 0.0802  
Epoch: [1][33460/51233] Data 0.319 (0.311) Elapsed 297m 34s (remain 158m 3s) Loss: 0.0505(0.0601) Grad: 0.8409  
Epoch: [1][33480/51233] Data 0.317 (0.311) Elapsed 297m 45s (remain 157m 52s) Loss: 0.0154(0.0601) Grad: 0.2928  
Epoch: [1][33500/51233] Data 0.318 (0.311) Elapsed 297m 55s (remain 157m 41s) Loss: 0.1340(0.0601) Grad: 2.1090  
Epoch: [1][33520/51233] Data 0.307 (0.311) Elapsed 298m 6s (remain 157m 31s) Loss: 0.0038(0.0601) Grad: 0.1057  
Epoch: [1][33540/51233] Data 0.307 (0.311) Elapsed 298m 17s (remain 157m 20s) Loss: 0.0178(0.0600) Grad: 0.2791  
Epoch: [1][33560/51233] Data 0.308 (0.311) Elapsed 298m 28s (remain 157m 9s) Loss: 0.0009(0.0600) Grad: 0.0137  
Epoch: [1][33580/51233] Data 0.314 (0.311) Elapsed 298m 38s (remain 156m 59s) Loss: 0.0177(0.0600) Grad: 0.4776  
Epoch: [1][33600/51233] Data 0.310 (0.311) Elapsed 298m 49s (remain 156m 48s) Loss: 0.0292(0.0600) Grad: 0.4695  
Epoch: [1][33620/51233] Data 0.306 (0.311) Elapsed 299m 0s (remain 156m 37s) Loss: 0.0240(0.0600) Grad: 0.3803  
Epoch: [1][33640/51233] Data 0.317 (0.311) Elapsed 299m 10s (remain 156m 27s) Loss: 0.0201(0.0600) Grad: 0.2339  
Epoch: [1][33660/51233] Data 0.316 (0.311) Elapsed 299m 21s (remain 156m 16s) Loss: 0.0011(0.0600) Grad: 0.0163  
Epoch: [1][33680/51233] Data 0.317 (0.311) Elapsed 299m 32s (remain 156m 5s) Loss: 0.0002(0.0600) Grad: 0.0030  
Epoch: [1][33700/51233] Data 0.318 (0.311) Elapsed 299m 42s (remain 155m 55s) Loss: 0.0031(0.0600) Grad: 0.0518  
Epoch: [1][33720/51233] Data 0.307 (0.311) Elapsed 299m 53s (remain 155m 44s) Loss: 0.1585(0.0600) Grad: 4.3027  
Epoch: [1][33740/51233] Data 0.317 (0.311) Elapsed 300m 4s (remain 155m 33s) Loss: 0.0017(0.0600) Grad: 0.0197  
Epoch: [1][33760/51233] Data 0.317 (0.311) Elapsed 300m 14s (remain 155m 23s) Loss: 0.2206(0.0600) Grad: 1.4633  
Epoch: [1][33780/51233] Data 0.316 (0.311) Elapsed 300m 25s (remain 155m 12s) Loss: 0.0049(0.0600) Grad: 0.0604  
Epoch: [1][33800/51233] Data 0.307 (0.311) Elapsed 300m 36s (remain 155m 1s) Loss: 0.0108(0.0600) Grad: 0.1801  
Epoch: [1][33820/51233] Data 0.309 (0.311) Elapsed 300m 46s (remain 154m 51s) Loss: 0.0023(0.0600) Grad: 0.0368  
Epoch: [1][33840/51233] Data 0.310 (0.311) Elapsed 300m 57s (remain 154m 40s) Loss: 0.0310(0.0599) Grad: 0.7931  
Epoch: [1][33860/51233] Data 0.313 (0.311) Elapsed 301m 8s (remain 154m 29s) Loss: 0.0841(0.0599) Grad: 1.7497  
Epoch: [1][33880/51233] Data 0.310 (0.311) Elapsed 301m 18s (remain 154m 19s) Loss: 0.0572(0.0600) Grad: 0.9678  
Epoch: [1][33900/51233] Data 0.300 (0.311) Elapsed 301m 29s (remain 154m 8s) Loss: 0.0316(0.0599) Grad: 0.7537  
Epoch: [1][33920/51233] Data 0.318 (0.311) Elapsed 301m 40s (remain 153m 57s) Loss: 0.0017(0.0599) Grad: 0.0305  
Epoch: [1][33940/51233] Data 0.308 (0.311) Elapsed 301m 51s (remain 153m 47s) Loss: 0.0010(0.0599) Grad: 0.0139  
Epoch: [1][33960/51233] Data 0.306 (0.311) Elapsed 302m 1s (remain 153m 36s) Loss: 0.0201(0.0599) Grad: 0.4083  
Epoch: [1][33980/51233] Data 0.318 (0.311) Elapsed 302m 12s (remain 153m 25s) Loss: 0.0369(0.0599) Grad: 0.4568  
Epoch: [1][34000/51233] Data 0.307 (0.311) Elapsed 302m 23s (remain 153m 15s) Loss: 0.0803(0.0599) Grad: 0.7245  
Epoch: [1][34020/51233] Data 0.318 (0.311) Elapsed 302m 33s (remain 153m 4s) Loss: 0.0199(0.0599) Grad: 0.3205  
Epoch: [1][34040/51233] Data 0.296 (0.311) Elapsed 302m 44s (remain 152m 53s) Loss: 0.0920(0.0599) Grad: 1.0254  
Epoch: [1][34060/51233] Data 0.309 (0.311) Elapsed 302m 55s (remain 152m 43s) Loss: 0.0016(0.0599) Grad: 0.0594  
Epoch: [1][34080/51233] Data 0.317 (0.311) Elapsed 303m 5s (remain 152m 32s) Loss: 0.0109(0.0599) Grad: 0.1530  
Epoch: [1][34100/51233] Data 0.316 (0.311) Elapsed 303m 16s (remain 152m 21s) Loss: 0.0114(0.0599) Grad: 0.1250  
Epoch: [1][34120/51233] Data 0.298 (0.311) Elapsed 303m 27s (remain 152m 11s) Loss: 0.0153(0.0599) Grad: 0.3238  
Epoch: [1][34140/51233] Data 0.307 (0.311) Elapsed 303m 37s (remain 152m 0s) Loss: 0.1992(0.0599) Grad: 2.2203  
Epoch: [1][34160/51233] Data 0.318 (0.311) Elapsed 303m 48s (remain 151m 49s) Loss: 0.0455(0.0599) Grad: 0.8157  
Epoch: [1][34180/51233] Data 0.308 (0.311) Elapsed 303m 59s (remain 151m 39s) Loss: 0.0023(0.0598) Grad: 0.0451  
Epoch: [1][34200/51233] Data 0.318 (0.311) Elapsed 304m 9s (remain 151m 28s) Loss: 0.0560(0.0598) Grad: 1.1289  
Epoch: [1][34220/51233] Data 0.317 (0.311) Elapsed 304m 20s (remain 151m 17s) Loss: 0.0260(0.0598) Grad: 0.3502  
Epoch: [1][34240/51233] Data 0.310 (0.311) Elapsed 304m 31s (remain 151m 7s) Loss: 0.0046(0.0598) Grad: 0.1076  
Epoch: [1][34260/51233] Data 0.306 (0.311) Elapsed 304m 42s (remain 150m 56s) Loss: 0.0154(0.0598) Grad: 0.1956  
Epoch: [1][34280/51233] Data 0.318 (0.311) Elapsed 304m 52s (remain 150m 45s) Loss: 0.0477(0.0598) Grad: 0.5311  
Epoch: [1][34300/51233] Data 0.318 (0.311) Elapsed 305m 3s (remain 150m 35s) Loss: 0.0093(0.0598) Grad: 0.1295  
Epoch: [1][34320/51233] Data 0.318 (0.311) Elapsed 305m 14s (remain 150m 24s) Loss: 0.0662(0.0598) Grad: 1.3117  
Epoch: [1][34340/51233] Data 0.310 (0.311) Elapsed 305m 24s (remain 150m 13s) Loss: 0.3676(0.0598) Grad: 3.0545  
Epoch: [1][34360/51233] Data 0.316 (0.311) Elapsed 305m 35s (remain 150m 3s) Loss: 0.0108(0.0598) Grad: 0.1579  
Epoch: [1][34380/51233] Data 0.308 (0.311) Elapsed 305m 46s (remain 149m 52s) Loss: 0.0113(0.0598) Grad: 0.1420  
Epoch: [1][34400/51233] Data 0.317 (0.311) Elapsed 305m 56s (remain 149m 41s) Loss: 0.0014(0.0598) Grad: 0.0144  
Epoch: [1][34420/51233] Data 0.317 (0.311) Elapsed 306m 7s (remain 149m 31s) Loss: 0.0271(0.0598) Grad: 0.5621  
Epoch: [1][34440/51233] Data 0.318 (0.311) Elapsed 306m 18s (remain 149m 20s) Loss: 0.0710(0.0598) Grad: 0.6222  
Epoch: [1][34460/51233] Data 0.321 (0.311) Elapsed 306m 28s (remain 149m 9s) Loss: 0.0170(0.0598) Grad: 0.2785  
Epoch: [1][34480/51233] Data 0.303 (0.311) Elapsed 306m 39s (remain 148m 59s) Loss: 0.1636(0.0598) Grad: 0.9727  
Epoch: [1][34500/51233] Data 0.295 (0.311) Elapsed 306m 50s (remain 148m 48s) Loss: 0.2660(0.0598) Grad: 1.6382  
Epoch: [1][34520/51233] Data 0.304 (0.311) Elapsed 307m 0s (remain 148m 37s) Loss: 0.0099(0.0598) Grad: 0.1329  
Epoch: [1][34540/51233] Data 0.304 (0.311) Elapsed 307m 11s (remain 148m 27s) Loss: 0.0009(0.0597) Grad: 0.0138  
Epoch: [1][34560/51233] Data 0.309 (0.311) Elapsed 307m 22s (remain 148m 16s) Loss: 0.1181(0.0597) Grad: 0.9423  
Epoch: [1][34580/51233] Data 0.306 (0.311) Elapsed 307m 33s (remain 148m 5s) Loss: 0.0137(0.0597) Grad: 0.2805  
Epoch: [1][34600/51233] Data 0.308 (0.311) Elapsed 307m 43s (remain 147m 55s) Loss: 0.0088(0.0597) Grad: 0.1581  
Epoch: [1][34620/51233] Data 0.319 (0.311) Elapsed 307m 54s (remain 147m 44s) Loss: 0.0162(0.0597) Grad: 0.3493  
Epoch: [1][34640/51233] Data 0.309 (0.311) Elapsed 308m 5s (remain 147m 33s) Loss: 0.0037(0.0597) Grad: 0.0629  
Epoch: [1][34660/51233] Data 0.317 (0.311) Elapsed 308m 15s (remain 147m 23s) Loss: 0.0024(0.0597) Grad: 0.0380  
Epoch: [1][34680/51233] Data 0.317 (0.311) Elapsed 308m 26s (remain 147m 12s) Loss: 0.0074(0.0597) Grad: 0.0806  
Epoch: [1][34700/51233] Data 0.308 (0.311) Elapsed 308m 37s (remain 147m 1s) Loss: 0.0231(0.0597) Grad: 0.2156  
Epoch: [1][34720/51233] Data 0.314 (0.311) Elapsed 308m 47s (remain 146m 51s) Loss: 0.0125(0.0597) Grad: 0.1256  
Epoch: [1][34740/51233] Data 0.318 (0.311) Elapsed 308m 58s (remain 146m 40s) Loss: 0.0205(0.0597) Grad: 0.3430  
Epoch: [1][34760/51233] Data 0.311 (0.311) Elapsed 309m 9s (remain 146m 29s) Loss: 0.0147(0.0597) Grad: 0.2147  
Epoch: [1][34780/51233] Data 0.316 (0.311) Elapsed 309m 19s (remain 146m 19s) Loss: 0.0096(0.0596) Grad: 0.1833  
Epoch: [1][34800/51233] Data 0.312 (0.311) Elapsed 309m 30s (remain 146m 8s) Loss: 0.0867(0.0596) Grad: 1.3517  
Epoch: [1][34820/51233] Data 0.318 (0.311) Elapsed 309m 41s (remain 145m 57s) Loss: 0.0086(0.0596) Grad: 0.2231  
Epoch: [1][34840/51233] Data 0.318 (0.311) Elapsed 309m 51s (remain 145m 47s) Loss: 0.0020(0.0596) Grad: 0.0525  
Epoch: [1][34860/51233] Data 0.307 (0.311) Elapsed 310m 2s (remain 145m 36s) Loss: 0.0049(0.0596) Grad: 0.0700  
Epoch: [1][34880/51233] Data 0.313 (0.311) Elapsed 310m 13s (remain 145m 25s) Loss: 0.0979(0.0596) Grad: 1.4655  
Epoch: [1][34900/51233] Data 0.317 (0.311) Elapsed 310m 24s (remain 145m 15s) Loss: 0.0134(0.0596) Grad: 0.2222  
Epoch: [1][34920/51233] Data 0.305 (0.311) Elapsed 310m 34s (remain 145m 4s) Loss: 0.0035(0.0596) Grad: 0.0614  
Epoch: [1][34940/51233] Data 0.306 (0.311) Elapsed 310m 45s (remain 144m 53s) Loss: 0.0007(0.0596) Grad: 0.0061  
Epoch: [1][34960/51233] Data 0.317 (0.311) Elapsed 310m 56s (remain 144m 43s) Loss: 0.0724(0.0596) Grad: 1.1902  
Epoch: [1][34980/51233] Data 0.306 (0.311) Elapsed 311m 6s (remain 144m 32s) Loss: 0.1229(0.0596) Grad: 1.6797  
Epoch: [1][35000/51233] Data 0.312 (0.311) Elapsed 311m 17s (remain 144m 21s) Loss: 0.0031(0.0596) Grad: 0.0306  
Epoch: [1][35020/51233] Data 0.309 (0.311) Elapsed 311m 28s (remain 144m 11s) Loss: 0.3032(0.0596) Grad: 1.9794  
Epoch: [1][35040/51233] Data 0.316 (0.311) Elapsed 311m 38s (remain 144m 0s) Loss: 0.0121(0.0596) Grad: 0.1582  
Epoch: [1][35060/51233] Data 0.317 (0.311) Elapsed 311m 49s (remain 143m 49s) Loss: 0.0031(0.0596) Grad: 0.0568  
Epoch: [1][35100/51233] Data 0.316 (0.311) Elapsed 312m 10s (remain 143m 28s) Loss: 0.0151(0.0596) Grad: 0.2146  
Epoch: [1][35120/51233] Data 0.307 (0.311) Elapsed 312m 21s (remain 143m 17s) Loss: 0.0050(0.0596) Grad: 0.0677  
Epoch: [1][35140/51233] Data 0.318 (0.311) Elapsed 312m 32s (remain 143m 7s) Loss: 0.0092(0.0596) Grad: 0.3300  
Epoch: [1][35160/51233] Data 0.318 (0.311) Elapsed 312m 42s (remain 142m 56s) Loss: 0.0156(0.0596) Grad: 0.3590  
Epoch: [1][35180/51233] Data 0.309 (0.311) Elapsed 312m 53s (remain 142m 45s) Loss: 0.0093(0.0595) Grad: 0.1518  
Epoch: [1][35200/51233] Data 0.311 (0.311) Elapsed 313m 4s (remain 142m 35s) Loss: 0.0010(0.0595) Grad: 0.0258  
Epoch: [1][35220/51233] Data 0.317 (0.311) Elapsed 313m 15s (remain 142m 24s) Loss: 0.0134(0.0596) Grad: 0.3354  
Epoch: [1][35240/51233] Data 0.316 (0.311) Elapsed 313m 25s (remain 142m 13s) Loss: 0.0174(0.0596) Grad: 0.2451  
Epoch: [1][35260/51233] Data 0.311 (0.311) Elapsed 313m 36s (remain 142m 3s) Loss: 0.0037(0.0596) Grad: 0.0419  
Epoch: [1][35280/51233] Data 0.303 (0.311) Elapsed 313m 47s (remain 141m 52s) Loss: 0.0028(0.0595) Grad: 0.0614  
Epoch: [1][35300/51233] Data 0.306 (0.311) Elapsed 313m 57s (remain 141m 41s) Loss: 0.0735(0.0595) Grad: 1.0470  
Epoch: [1][35320/51233] Data 0.308 (0.311) Elapsed 314m 8s (remain 141m 31s) Loss: 0.0068(0.0595) Grad: 0.1040  
Epoch: [1][35340/51233] Data 0.315 (0.311) Elapsed 314m 19s (remain 141m 20s) Loss: 0.0973(0.0595) Grad: 1.4182  
Epoch: [1][35360/51233] Data 0.311 (0.311) Elapsed 314m 29s (remain 141m 9s) Loss: 0.0383(0.0595) Grad: 0.8603  
Epoch: [1][35380/51233] Data 0.317 (0.311) Elapsed 314m 40s (remain 140m 59s) Loss: 0.0093(0.0595) Grad: 0.2300  
Epoch: [1][35400/51233] Data 0.318 (0.311) Elapsed 314m 51s (remain 140m 48s) Loss: 0.0090(0.0595) Grad: 0.1436  
Epoch: [1][35420/51233] Data 0.309 (0.311) Elapsed 315m 1s (remain 140m 37s) Loss: 0.0069(0.0595) Grad: 0.1472  
Epoch: [1][35440/51233] Data 0.310 (0.311) Elapsed 315m 12s (remain 140m 27s) Loss: 0.0086(0.0595) Grad: 0.2943  
Epoch: [1][35460/51233] Data 0.317 (0.311) Elapsed 315m 23s (remain 140m 16s) Loss: 0.0014(0.0595) Grad: 0.0225  
Epoch: [1][35480/51233] Data 0.313 (0.311) Elapsed 315m 33s (remain 140m 5s) Loss: 0.0408(0.0595) Grad: 0.4324  
Epoch: [1][35500/51233] Data 0.309 (0.311) Elapsed 315m 44s (remain 139m 55s) Loss: 0.0601(0.0595) Grad: 0.6411  
Epoch: [1][35520/51233] Data 0.318 (0.311) Elapsed 315m 55s (remain 139m 44s) Loss: 0.0068(0.0595) Grad: 0.0928  
Epoch: [1][35540/51233] Data 0.308 (0.311) Elapsed 316m 6s (remain 139m 33s) Loss: 0.0377(0.0594) Grad: 0.5792  
Epoch: [1][35560/51233] Data 0.317 (0.311) Elapsed 316m 16s (remain 139m 23s) Loss: 0.0116(0.0594) Grad: 0.1770  
Epoch: [1][35580/51233] Data 0.297 (0.311) Elapsed 316m 27s (remain 139m 12s) Loss: 0.0014(0.0594) Grad: 0.0211  
Epoch: [1][35600/51233] Data 0.292 (0.311) Elapsed 316m 38s (remain 139m 1s) Loss: 0.0020(0.0594) Grad: 0.0329  
Epoch: [1][35620/51233] Data 0.317 (0.311) Elapsed 316m 48s (remain 138m 51s) Loss: 0.0007(0.0594) Grad: 0.0125  
Epoch: [1][35640/51233] Data 0.313 (0.311) Elapsed 316m 59s (remain 138m 40s) Loss: 0.0789(0.0594) Grad: 0.6958  
Epoch: [1][35660/51233] Data 0.308 (0.311) Elapsed 317m 10s (remain 138m 29s) Loss: 0.0028(0.0594) Grad: 0.0507  
Epoch: [1][35680/51233] Data 0.308 (0.311) Elapsed 317m 20s (remain 138m 19s) Loss: 0.1177(0.0594) Grad: 1.6026  
Epoch: [1][35700/51233] Data 0.316 (0.311) Elapsed 317m 31s (remain 138m 8s) Loss: 0.0994(0.0594) Grad: 2.0466  
Epoch: [1][35720/51233] Data 0.304 (0.311) Elapsed 317m 42s (remain 137m 57s) Loss: 0.0046(0.0594) Grad: 0.0892  
Epoch: [1][35740/51233] Data 0.317 (0.311) Elapsed 317m 52s (remain 137m 47s) Loss: 0.0133(0.0594) Grad: 0.1829  
Epoch: [1][35760/51233] Data 0.317 (0.311) Elapsed 318m 3s (remain 137m 36s) Loss: 0.0439(0.0594) Grad: 0.5209  
Epoch: [1][35780/51233] Data 0.308 (0.311) Elapsed 318m 14s (remain 137m 25s) Loss: 0.1501(0.0594) Grad: 2.3278  
Epoch: [1][35800/51233] Data 0.308 (0.311) Elapsed 318m 24s (remain 137m 15s) Loss: 0.0140(0.0594) Grad: 0.3945  
Epoch: [1][35820/51233] Data 0.307 (0.311) Elapsed 318m 35s (remain 137m 4s) Loss: 0.0023(0.0594) Grad: 0.0620  
Epoch: [1][35840/51233] Data 0.305 (0.311) Elapsed 318m 46s (remain 136m 53s) Loss: 0.0089(0.0594) Grad: 0.1136  
Epoch: [1][35860/51233] Data 0.308 (0.311) Elapsed 318m 57s (remain 136m 43s) Loss: 0.0033(0.0594) Grad: 0.0594  
Epoch: [1][35880/51233] Data 0.317 (0.311) Elapsed 319m 7s (remain 136m 32s) Loss: 0.0123(0.0594) Grad: 0.1612  
Epoch: [1][35900/51233] Data 0.309 (0.311) Elapsed 319m 18s (remain 136m 21s) Loss: 0.0231(0.0594) Grad: 0.4023  
Epoch: [1][35920/51233] Data 0.319 (0.311) Elapsed 319m 29s (remain 136m 11s) Loss: 0.0050(0.0594) Grad: 0.0774  
Epoch: [1][35940/51233] Data 0.318 (0.311) Elapsed 319m 39s (remain 136m 0s) Loss: 0.0197(0.0594) Grad: 0.2861  
Epoch: [1][35960/51233] Data 0.317 (0.311) Elapsed 319m 50s (remain 135m 49s) Loss: 0.3580(0.0594) Grad: 1.6794  
Epoch: [1][35980/51233] Data 0.300 (0.311) Elapsed 320m 1s (remain 135m 39s) Loss: 0.0366(0.0594) Grad: 0.6247  
Epoch: [1][36000/51233] Data 0.295 (0.311) Elapsed 320m 11s (remain 135m 28s) Loss: 0.0254(0.0594) Grad: 0.3144  
Epoch: [1][36020/51233] Data 0.307 (0.311) Elapsed 320m 22s (remain 135m 17s) Loss: 0.0283(0.0594) Grad: 0.3037  
Epoch: [1][36040/51233] Data 0.317 (0.311) Elapsed 320m 33s (remain 135m 7s) Loss: 0.0196(0.0594) Grad: 0.2491  
Epoch: [1][36060/51233] Data 0.310 (0.311) Elapsed 320m 43s (remain 134m 56s) Loss: 0.2097(0.0594) Grad: 1.7788  
Epoch: [1][36080/51233] Data 0.317 (0.311) Elapsed 320m 54s (remain 134m 45s) Loss: 0.0025(0.0593) Grad: 0.0575  
Epoch: [1][36100/51233] Data 0.309 (0.311) Elapsed 321m 5s (remain 134m 35s) Loss: 0.0079(0.0594) Grad: 0.0879  
Epoch: [1][36120/51233] Data 0.298 (0.311) Elapsed 321m 15s (remain 134m 24s) Loss: 0.0096(0.0594) Grad: 0.2556  
Epoch: [1][36140/51233] Data 0.307 (0.311) Elapsed 321m 26s (remain 134m 13s) Loss: 0.2731(0.0594) Grad: 1.9264  
Epoch: [1][36160/51233] Data 0.301 (0.311) Elapsed 321m 37s (remain 134m 3s) Loss: 0.0068(0.0594) Grad: 0.1084  
Epoch: [1][36180/51233] Data 0.317 (0.311) Elapsed 321m 48s (remain 133m 52s) Loss: 0.0826(0.0593) Grad: 1.0322  
Epoch: [1][36200/51233] Data 0.317 (0.311) Elapsed 321m 58s (remain 133m 41s) Loss: 0.0481(0.0593) Grad: 0.4995  
Epoch: [1][36220/51233] Data 0.312 (0.311) Elapsed 322m 9s (remain 133m 31s) Loss: 0.0323(0.0593) Grad: 0.5526  
Epoch: [1][36240/51233] Data 0.308 (0.311) Elapsed 322m 20s (remain 133m 20s) Loss: 0.0279(0.0593) Grad: 0.3465  
Epoch: [1][36260/51233] Data 0.318 (0.311) Elapsed 322m 30s (remain 133m 9s) Loss: 0.0047(0.0593) Grad: 0.1106  
Epoch: [1][36280/51233] Data 0.317 (0.311) Elapsed 322m 41s (remain 132m 59s) Loss: 0.1851(0.0593) Grad: 1.8052  
Epoch: [1][36300/51233] Data 0.308 (0.311) Elapsed 322m 52s (remain 132m 48s) Loss: 0.0032(0.0593) Grad: 0.0341  
Epoch: [1][36320/51233] Data 0.317 (0.311) Elapsed 323m 2s (remain 132m 37s) Loss: 0.0123(0.0593) Grad: 0.2584  
Epoch: [1][36340/51233] Data 0.317 (0.311) Elapsed 323m 13s (remain 132m 27s) Loss: 0.1810(0.0593) Grad: 2.1375  
Epoch: [1][36360/51233] Data 0.307 (0.311) Elapsed 323m 24s (remain 132m 16s) Loss: 0.1710(0.0593) Grad: 1.4845  
Epoch: [1][36380/51233] Data 0.316 (0.311) Elapsed 323m 34s (remain 132m 5s) Loss: 0.0753(0.0593) Grad: 0.5996  
Epoch: [1][36400/51233] Data 0.306 (0.311) Elapsed 323m 45s (remain 131m 55s) Loss: 0.0025(0.0593) Grad: 0.0459  
Epoch: [1][36420/51233] Data 0.306 (0.311) Elapsed 323m 56s (remain 131m 44s) Loss: 0.0011(0.0593) Grad: 0.0202  
Epoch: [1][36440/51233] Data 0.308 (0.311) Elapsed 324m 6s (remain 131m 33s) Loss: 0.0115(0.0593) Grad: 0.2160  
Epoch: [1][36460/51233] Data 0.311 (0.311) Elapsed 324m 17s (remain 131m 23s) Loss: 0.0613(0.0593) Grad: 0.9192  
Epoch: [1][36480/51233] Data 0.309 (0.311) Elapsed 324m 28s (remain 131m 12s) Loss: 0.0329(0.0593) Grad: 0.5326  
Epoch: [1][36500/51233] Data 0.303 (0.311) Elapsed 324m 39s (remain 131m 1s) Loss: 0.0045(0.0593) Grad: 0.0574  
Epoch: [1][36520/51233] Data 0.317 (0.311) Elapsed 324m 49s (remain 130m 51s) Loss: 0.0269(0.0593) Grad: 0.4744  
Epoch: [1][36540/51233] Data 0.319 (0.311) Elapsed 325m 0s (remain 130m 40s) Loss: 0.0223(0.0593) Grad: 0.3704  
Epoch: [1][36560/51233] Data 0.304 (0.311) Elapsed 325m 11s (remain 130m 29s) Loss: 0.0558(0.0593) Grad: 1.0921  
Epoch: [1][36580/51233] Data 0.302 (0.311) Elapsed 325m 21s (remain 130m 19s) Loss: 0.0068(0.0593) Grad: 0.1244  
Epoch: [1][36600/51233] Data 0.318 (0.311) Elapsed 325m 32s (remain 130m 8s) Loss: 0.0118(0.0593) Grad: 0.1548  
Epoch: [1][36620/51233] Data 0.318 (0.311) Elapsed 325m 43s (remain 129m 57s) Loss: 0.0920(0.0593) Grad: 1.5229  
Epoch: [1][36640/51233] Data 0.317 (0.311) Elapsed 325m 53s (remain 129m 47s) Loss: 0.0216(0.0592) Grad: 0.4251  
Epoch: [1][36660/51233] Data 0.317 (0.311) Elapsed 326m 4s (remain 129m 36s) Loss: 0.0015(0.0592) Grad: 0.0159  
Epoch: [1][36680/51233] Data 0.304 (0.311) Elapsed 326m 15s (remain 129m 25s) Loss: 0.0308(0.0592) Grad: 0.5310  
Epoch: [1][36700/51233] Data 0.304 (0.311) Elapsed 326m 25s (remain 129m 15s) Loss: 0.0062(0.0592) Grad: 0.0769  
Epoch: [1][36720/51233] Data 0.311 (0.311) Elapsed 326m 36s (remain 129m 4s) Loss: 0.0022(0.0592) Grad: 0.0399  
Epoch: [1][36740/51233] Data 0.319 (0.311) Elapsed 326m 47s (remain 128m 53s) Loss: 0.3419(0.0592) Grad: 1.9508  
Epoch: [1][36760/51233] Data 0.317 (0.311) Elapsed 326m 57s (remain 128m 43s) Loss: 0.3099(0.0592) Grad: 2.3138  
Epoch: [1][36780/51233] Data 0.308 (0.311) Elapsed 327m 8s (remain 128m 32s) Loss: 0.0070(0.0592) Grad: 0.1236  
Epoch: [1][36800/51233] Data 0.309 (0.311) Elapsed 327m 19s (remain 128m 21s) Loss: 0.0122(0.0592) Grad: 0.2120  
Epoch: [1][36820/51233] Data 0.316 (0.311) Elapsed 327m 30s (remain 128m 11s) Loss: 0.0119(0.0592) Grad: 0.2819  
Epoch: [1][36840/51233] Data 0.309 (0.311) Elapsed 327m 40s (remain 128m 0s) Loss: 0.2596(0.0592) Grad: 1.3022  
Epoch: [1][36860/51233] Data 0.317 (0.311) Elapsed 327m 51s (remain 127m 49s) Loss: 0.0017(0.0592) Grad: 0.0244  
Epoch: [1][36880/51233] Data 0.315 (0.311) Elapsed 328m 2s (remain 127m 39s) Loss: 0.0012(0.0592) Grad: 0.0176  
Epoch: [1][36900/51233] Data 0.315 (0.311) Elapsed 328m 12s (remain 127m 28s) Loss: 0.2099(0.0592) Grad: 2.3237  
Epoch: [1][36920/51233] Data 0.318 (0.311) Elapsed 328m 23s (remain 127m 17s) Loss: 0.0073(0.0592) Grad: 0.1665  
Epoch: [1][36940/51233] Data 0.318 (0.311) Elapsed 328m 34s (remain 127m 7s) Loss: 0.0463(0.0592) Grad: 0.7530  
Epoch: [1][36960/51233] Data 0.307 (0.311) Elapsed 328m 44s (remain 126m 56s) Loss: 0.0008(0.0592) Grad: 0.0168  
Epoch: [1][36980/51233] Data 0.318 (0.311) Elapsed 328m 55s (remain 126m 45s) Loss: 0.0020(0.0591) Grad: 0.0346  
Epoch: [1][37000/51233] Data 0.316 (0.311) Elapsed 329m 6s (remain 126m 35s) Loss: 0.0046(0.0591) Grad: 0.1155  
Epoch: [1][37020/51233] Data 0.317 (0.311) Elapsed 329m 16s (remain 126m 24s) Loss: 0.0518(0.0591) Grad: 0.9978  
Epoch: [1][37040/51233] Data 0.318 (0.311) Elapsed 329m 27s (remain 126m 13s) Loss: 0.1957(0.0591) Grad: 2.4914  
Epoch: [1][37060/51233] Data 0.304 (0.311) Elapsed 329m 38s (remain 126m 3s) Loss: 0.0047(0.0591) Grad: 0.0628  
Epoch: [1][37080/51233] Data 0.304 (0.311) Elapsed 329m 48s (remain 125m 52s) Loss: 0.0602(0.0591) Grad: 1.6981  
Epoch: [1][37100/51233] Data 0.309 (0.311) Elapsed 329m 59s (remain 125m 41s) Loss: 0.4267(0.0591) Grad: 1.2508  
Epoch: [1][37120/51233] Data 0.309 (0.311) Elapsed 330m 10s (remain 125m 31s) Loss: 0.0955(0.0591) Grad: 1.0541  
Epoch: [1][37140/51233] Data 0.305 (0.311) Elapsed 330m 20s (remain 125m 20s) Loss: 0.0190(0.0591) Grad: 0.2844  
Epoch: [1][37160/51233] Data 0.317 (0.311) Elapsed 330m 31s (remain 125m 9s) Loss: 0.0033(0.0591) Grad: 0.0321  
Epoch: [1][37180/51233] Data 0.318 (0.311) Elapsed 330m 42s (remain 124m 59s) Loss: 0.0106(0.0591) Grad: 0.1648  
Epoch: [1][37200/51233] Data 0.318 (0.311) Elapsed 330m 52s (remain 124m 48s) Loss: 0.0054(0.0591) Grad: 0.1546  
Epoch: [1][37220/51233] Data 0.305 (0.311) Elapsed 331m 3s (remain 124m 37s) Loss: 0.0894(0.0591) Grad: 0.9823  
Epoch: [1][37240/51233] Data 0.318 (0.311) Elapsed 331m 14s (remain 124m 27s) Loss: 0.0410(0.0591) Grad: 0.7596  
Epoch: [1][37260/51233] Data 0.317 (0.311) Elapsed 331m 24s (remain 124m 16s) Loss: 0.0557(0.0591) Grad: 0.6721  
Epoch: [1][37280/51233] Data 0.307 (0.311) Elapsed 331m 35s (remain 124m 5s) Loss: 0.0301(0.0591) Grad: 0.5046  
Epoch: [1][37300/51233] Data 0.318 (0.311) Elapsed 331m 46s (remain 123m 54s) Loss: 0.1147(0.0591) Grad: 2.7283  
Epoch: [1][37320/51233] Data 0.317 (0.311) Elapsed 331m 56s (remain 123m 44s) Loss: 0.0043(0.0590) Grad: 0.0721  
Epoch: [1][37340/51233] Data 0.317 (0.311) Elapsed 332m 7s (remain 123m 33s) Loss: 0.0052(0.0590) Grad: 0.1232  
Epoch: [1][37360/51233] Data 0.308 (0.311) Elapsed 332m 18s (remain 123m 22s) Loss: 0.1503(0.0590) Grad: 1.6752  
Epoch: [1][37380/51233] Data 0.318 (0.311) Elapsed 332m 28s (remain 123m 12s) Loss: 0.0007(0.0590) Grad: 0.0076  
Epoch: [1][37400/51233] Data 0.307 (0.311) Elapsed 332m 39s (remain 123m 1s) Loss: 0.0055(0.0590) Grad: 0.1581  
Epoch: [1][37420/51233] Data 0.299 (0.311) Elapsed 332m 50s (remain 122m 50s) Loss: 0.0002(0.0590) Grad: 0.0016  
Epoch: [1][37440/51233] Data 0.316 (0.311) Elapsed 333m 0s (remain 122m 40s) Loss: 0.0079(0.0590) Grad: 0.1438  
Epoch: [1][37460/51233] Data 0.318 (0.311) Elapsed 333m 11s (remain 122m 29s) Loss: 0.0549(0.0590) Grad: 0.6024  
Epoch: [1][37480/51233] Data 0.313 (0.311) Elapsed 333m 22s (remain 122m 18s) Loss: 0.0394(0.0590) Grad: 1.1094  
Epoch: [1][37500/51233] Data 0.033 (0.311) Elapsed 333m 32s (remain 122m 8s) Loss: 0.0426(0.0590) Grad: 0.7154  
Epoch: [1][37520/51233] Data 0.306 (0.311) Elapsed 333m 43s (remain 121m 57s) Loss: 0.1576(0.0590) Grad: 2.0429  
Epoch: [1][37540/51233] Data 0.317 (0.311) Elapsed 333m 54s (remain 121m 46s) Loss: 0.0006(0.0590) Grad: 0.0078  
Epoch: [1][37560/51233] Data 0.310 (0.310) Elapsed 334m 4s (remain 121m 36s) Loss: 0.0004(0.0590) Grad: 0.0041  
Epoch: [1][37580/51233] Data 0.305 (0.311) Elapsed 334m 15s (remain 121m 25s) Loss: 0.0049(0.0590) Grad: 0.0716  
Epoch: [1][37600/51233] Data 0.317 (0.311) Elapsed 334m 26s (remain 121m 14s) Loss: 0.0233(0.0590) Grad: 0.5657  
Epoch: [1][37620/51233] Data 0.317 (0.311) Elapsed 334m 37s (remain 121m 4s) Loss: 0.0344(0.0590) Grad: 0.5175  
Epoch: [1][37640/51233] Data 0.318 (0.311) Elapsed 334m 47s (remain 120m 53s) Loss: 0.0998(0.0589) Grad: 1.1474  
Epoch: [1][37660/51233] Data 0.309 (0.311) Elapsed 334m 58s (remain 120m 42s) Loss: 0.0575(0.0589) Grad: 0.8291  
Epoch: [1][37680/51233] Data 0.317 (0.311) Elapsed 335m 9s (remain 120m 32s) Loss: 0.0447(0.0589) Grad: 0.5899  
Epoch: [1][37700/51233] Data 0.309 (0.310) Elapsed 335m 19s (remain 120m 21s) Loss: 0.0011(0.0589) Grad: 0.0292  
Epoch: [1][37720/51233] Data 0.301 (0.311) Elapsed 335m 30s (remain 120m 10s) Loss: 0.0014(0.0589) Grad: 0.0155  
Epoch: [1][37740/51233] Data 0.308 (0.311) Elapsed 335m 41s (remain 120m 0s) Loss: 0.0228(0.0589) Grad: 0.4569  
Epoch: [1][37760/51233] Data 0.299 (0.311) Elapsed 335m 51s (remain 119m 49s) Loss: 0.1455(0.0589) Grad: 3.9630  
Epoch: [1][37780/51233] Data 0.312 (0.311) Elapsed 336m 2s (remain 119m 38s) Loss: 0.3811(0.0589) Grad: 1.4414  
Epoch: [1][37800/51233] Data 0.315 (0.311) Elapsed 336m 13s (remain 119m 28s) Loss: 0.0163(0.0589) Grad: 0.3867  
Epoch: [1][37820/51233] Data 0.303 (0.311) Elapsed 336m 23s (remain 119m 17s) Loss: 0.0375(0.0589) Grad: 0.6396  
Epoch: [1][37840/51233] Data 0.301 (0.311) Elapsed 336m 34s (remain 119m 6s) Loss: 0.0316(0.0589) Grad: 0.4375  
Epoch: [1][37860/51233] Data 0.317 (0.311) Elapsed 336m 45s (remain 118m 56s) Loss: 0.0145(0.0589) Grad: 0.1928  
Epoch: [1][37880/51233] Data 0.318 (0.311) Elapsed 336m 55s (remain 118m 45s) Loss: 0.0137(0.0589) Grad: 0.3657  
Epoch: [1][37900/51233] Data 0.318 (0.311) Elapsed 337m 6s (remain 118m 34s) Loss: 0.0051(0.0589) Grad: 0.0658  
Epoch: [1][37920/51233] Data 0.307 (0.311) Elapsed 337m 17s (remain 118m 24s) Loss: 0.0126(0.0589) Grad: 0.1953  
Epoch: [1][37940/51233] Data 0.308 (0.311) Elapsed 337m 27s (remain 118m 13s) Loss: 0.0173(0.0589) Grad: 0.4120  
Epoch: [1][37960/51233] Data 0.307 (0.311) Elapsed 337m 38s (remain 118m 2s) Loss: 0.0172(0.0589) Grad: 0.2845  
Epoch: [1][37980/51233] Data 0.297 (0.311) Elapsed 337m 49s (remain 117m 52s) Loss: 0.0016(0.0589) Grad: 0.0168  
Epoch: [1][38000/51233] Data 0.309 (0.311) Elapsed 338m 0s (remain 117m 41s) Loss: 0.0452(0.0589) Grad: 0.9704  
Epoch: [1][38020/51233] Data 0.304 (0.311) Elapsed 338m 10s (remain 117m 30s) Loss: 0.0511(0.0589) Grad: 1.0393  
Epoch: [1][38040/51233] Data 0.308 (0.311) Elapsed 338m 21s (remain 117m 20s) Loss: 0.0162(0.0589) Grad: 0.3435  
Epoch: [1][38060/51233] Data 0.306 (0.311) Elapsed 338m 32s (remain 117m 9s) Loss: 0.0006(0.0589) Grad: 0.0095  
Epoch: [1][38080/51233] Data 0.301 (0.311) Elapsed 338m 42s (remain 116m 58s) Loss: 0.0138(0.0589) Grad: 0.2716  
Epoch: [1][38100/51233] Data 0.313 (0.311) Elapsed 338m 53s (remain 116m 48s) Loss: 0.0208(0.0589) Grad: 0.4104  
Epoch: [1][38120/51233] Data 0.317 (0.311) Elapsed 339m 4s (remain 116m 37s) Loss: 0.0394(0.0589) Grad: 0.5679  
Epoch: [1][38140/51233] Data 0.307 (0.311) Elapsed 339m 14s (remain 116m 26s) Loss: 0.0052(0.0589) Grad: 0.1843  
Epoch: [1][38160/51233] Data 0.308 (0.311) Elapsed 339m 25s (remain 116m 16s) Loss: 0.0022(0.0588) Grad: 0.0296  
Epoch: [1][38180/51233] Data 0.308 (0.311) Elapsed 339m 36s (remain 116m 5s) Loss: 0.0011(0.0588) Grad: 0.0291  
Epoch: [1][38200/51233] Data 0.310 (0.311) Elapsed 339m 46s (remain 115m 54s) Loss: 0.0368(0.0588) Grad: 0.6066  
Epoch: [1][38220/51233] Data 0.309 (0.311) Elapsed 339m 57s (remain 115m 44s) Loss: 0.0035(0.0588) Grad: 0.0646  
Epoch: [1][38240/51233] Data 0.317 (0.311) Elapsed 340m 8s (remain 115m 33s) Loss: 0.0032(0.0588) Grad: 0.0452  
Epoch: [1][38260/51233] Data 0.313 (0.311) Elapsed 340m 18s (remain 115m 22s) Loss: 0.0101(0.0589) Grad: 0.1493  
Epoch: [1][38280/51233] Data 0.318 (0.311) Elapsed 340m 29s (remain 115m 12s) Loss: 0.0067(0.0589) Grad: 0.0919  
Epoch: [1][38300/51233] Data 0.309 (0.311) Elapsed 340m 40s (remain 115m 1s) Loss: 0.0261(0.0589) Grad: 0.4319  
Epoch: [1][38320/51233] Data 0.318 (0.311) Elapsed 340m 51s (remain 114m 50s) Loss: 0.0830(0.0589) Grad: 1.0306  
Epoch: [1][38340/51233] Data 0.318 (0.311) Elapsed 341m 1s (remain 114m 40s) Loss: 0.0015(0.0588) Grad: 0.0171  
Epoch: [1][38360/51233] Data 0.314 (0.311) Elapsed 341m 12s (remain 114m 29s) Loss: 0.1398(0.0588) Grad: 1.3936  
Epoch: [1][38380/51233] Data 0.303 (0.311) Elapsed 341m 23s (remain 114m 18s) Loss: 0.2664(0.0588) Grad: 1.5991  
Epoch: [1][38400/51233] Data 0.304 (0.311) Elapsed 341m 33s (remain 114m 8s) Loss: 0.1325(0.0589) Grad: 1.0585  
Epoch: [1][38420/51233] Data 0.306 (0.311) Elapsed 341m 44s (remain 113m 57s) Loss: 0.0048(0.0589) Grad: 0.0693  
Epoch: [1][38440/51233] Data 0.318 (0.311) Elapsed 341m 55s (remain 113m 46s) Loss: 0.0036(0.0589) Grad: 0.0615  
Epoch: [1][38460/51233] Data 0.308 (0.311) Elapsed 342m 5s (remain 113m 36s) Loss: 0.0072(0.0589) Grad: 0.1051  
Epoch: [1][38480/51233] Data 0.318 (0.311) Elapsed 342m 16s (remain 113m 25s) Loss: 0.0760(0.0589) Grad: 0.8674  
Epoch: [1][38500/51233] Data 0.310 (0.311) Elapsed 342m 27s (remain 113m 14s) Loss: 0.0031(0.0589) Grad: 0.0498  
Epoch: [1][38520/51233] Data 0.312 (0.311) Elapsed 342m 37s (remain 113m 4s) Loss: 0.0005(0.0589) Grad: 0.0093  
Epoch: [1][38540/51233] Data 0.310 (0.311) Elapsed 342m 48s (remain 112m 53s) Loss: 0.0610(0.0589) Grad: 0.3547  
Epoch: [1][38560/51233] Data 0.318 (0.311) Elapsed 342m 59s (remain 112m 42s) Loss: 0.0092(0.0589) Grad: 0.1403  
Epoch: [1][38580/51233] Data 0.317 (0.311) Elapsed 343m 9s (remain 112m 32s) Loss: 0.0028(0.0588) Grad: 0.0370  
Epoch: [1][38600/51233] Data 0.309 (0.311) Elapsed 343m 20s (remain 112m 21s) Loss: 0.0197(0.0588) Grad: 0.4067  
Epoch: [1][38620/51233] Data 0.318 (0.311) Elapsed 343m 31s (remain 112m 10s) Loss: 0.4672(0.0589) Grad: 3.8980  
Epoch: [1][38640/51233] Data 0.309 (0.311) Elapsed 343m 41s (remain 112m 0s) Loss: 0.1037(0.0588) Grad: 1.2738  
Epoch: [1][38660/51233] Data 0.308 (0.311) Elapsed 343m 52s (remain 111m 49s) Loss: 0.3012(0.0588) Grad: 2.3484  
Epoch: [1][38680/51233] Data 0.306 (0.311) Elapsed 344m 3s (remain 111m 38s) Loss: 0.0498(0.0588) Grad: 0.8909  
Epoch: [1][38700/51233] Data 0.305 (0.311) Elapsed 344m 14s (remain 111m 28s) Loss: 0.1549(0.0588) Grad: 1.6519  
Epoch: [1][38720/51233] Data 0.318 (0.311) Elapsed 344m 24s (remain 111m 17s) Loss: 0.0082(0.0588) Grad: 0.1920  
Epoch: [1][38740/51233] Data 0.308 (0.311) Elapsed 344m 35s (remain 111m 6s) Loss: 0.0172(0.0588) Grad: 0.2366  
Epoch: [1][38760/51233] Data 0.308 (0.311) Elapsed 344m 46s (remain 110m 56s) Loss: 0.0045(0.0588) Grad: 0.0896  
Epoch: [1][38780/51233] Data 0.299 (0.311) Elapsed 344m 56s (remain 110m 45s) Loss: 0.1297(0.0588) Grad: 1.7569  
Epoch: [1][38800/51233] Data 0.308 (0.311) Elapsed 345m 7s (remain 110m 34s) Loss: 0.0002(0.0588) Grad: 0.0023  
Epoch: [1][38820/51233] Data 0.308 (0.311) Elapsed 345m 18s (remain 110m 24s) Loss: 0.0408(0.0588) Grad: 1.0214  
Epoch: [1][38840/51233] Data 0.317 (0.311) Elapsed 345m 28s (remain 110m 13s) Loss: 0.0168(0.0588) Grad: 0.2265  
Epoch: [1][38860/51233] Data 0.317 (0.311) Elapsed 345m 39s (remain 110m 2s) Loss: 0.0545(0.0588) Grad: 0.3093  
Epoch: [1][38880/51233] Data 0.306 (0.311) Elapsed 345m 50s (remain 109m 52s) Loss: 0.0177(0.0588) Grad: 0.3532  
Epoch: [1][38900/51233] Data 0.318 (0.311) Elapsed 346m 0s (remain 109m 41s) Loss: 0.2250(0.0588) Grad: 1.4284  
Epoch: [1][38920/51233] Data 0.309 (0.311) Elapsed 346m 11s (remain 109m 30s) Loss: 0.1505(0.0588) Grad: 1.5169  
Epoch: [1][38940/51233] Data 0.310 (0.311) Elapsed 346m 22s (remain 109m 20s) Loss: 0.0014(0.0588) Grad: 0.0186  
Epoch: [1][38960/51233] Data 0.309 (0.311) Elapsed 346m 32s (remain 109m 9s) Loss: 0.0327(0.0587) Grad: 0.6632  
Epoch: [1][38980/51233] Data 0.305 (0.311) Elapsed 346m 43s (remain 108m 58s) Loss: 0.1405(0.0587) Grad: 1.6845  
Epoch: [1][39000/51233] Data 0.305 (0.311) Elapsed 346m 54s (remain 108m 48s) Loss: 0.0099(0.0587) Grad: 0.2182  
Epoch: [1][39020/51233] Data 0.294 (0.311) Elapsed 347m 5s (remain 108m 37s) Loss: 0.0010(0.0587) Grad: 0.0141  
Epoch: [1][39040/51233] Data 0.317 (0.311) Elapsed 347m 15s (remain 108m 26s) Loss: 0.0041(0.0587) Grad: 0.0351  
Epoch: [1][39060/51233] Data 0.306 (0.311) Elapsed 347m 26s (remain 108m 16s) Loss: 0.0097(0.0587) Grad: 0.1153  
Epoch: [1][39080/51233] Data 0.318 (0.311) Elapsed 347m 37s (remain 108m 5s) Loss: 0.0056(0.0587) Grad: 0.0952  
Epoch: [1][39100/51233] Data 0.310 (0.311) Elapsed 347m 47s (remain 107m 54s) Loss: 0.0120(0.0587) Grad: 0.1683  
Epoch: [1][39120/51233] Data 0.318 (0.311) Elapsed 347m 58s (remain 107m 44s) Loss: 0.0356(0.0587) Grad: 0.7475  
Epoch: [1][39140/51233] Data 0.312 (0.311) Elapsed 348m 9s (remain 107m 33s) Loss: 0.1024(0.0587) Grad: 1.1996  
Epoch: [1][39160/51233] Data 0.314 (0.311) Elapsed 348m 19s (remain 107m 22s) Loss: 0.0005(0.0587) Grad: 0.0068  
Epoch: [1][39180/51233] Data 0.318 (0.311) Elapsed 348m 30s (remain 107m 12s) Loss: 0.0166(0.0587) Grad: 0.2760  
Epoch: [1][39200/51233] Data 0.313 (0.311) Elapsed 348m 41s (remain 107m 1s) Loss: 0.0423(0.0587) Grad: 1.0829  
Epoch: [1][39220/51233] Data 0.301 (0.311) Elapsed 348m 51s (remain 106m 50s) Loss: 0.0322(0.0587) Grad: 0.4052  
Epoch: [1][39240/51233] Data 0.318 (0.311) Elapsed 349m 2s (remain 106m 40s) Loss: 0.0724(0.0587) Grad: 0.7437  
Epoch: [1][39260/51233] Data 0.295 (0.311) Elapsed 349m 13s (remain 106m 29s) Loss: 0.0084(0.0587) Grad: 0.1056  
Epoch: [1][39280/51233] Data 0.313 (0.311) Elapsed 349m 23s (remain 106m 18s) Loss: 0.0083(0.0587) Grad: 0.1290  
Epoch: [1][39300/51233] Data 0.318 (0.311) Elapsed 349m 34s (remain 106m 8s) Loss: 0.0018(0.0586) Grad: 0.0200  
Epoch: [1][39320/51233] Data 0.317 (0.311) Elapsed 349m 45s (remain 105m 57s) Loss: 0.2415(0.0587) Grad: 1.5473  
Epoch: [1][39340/51233] Data 0.317 (0.311) Elapsed 349m 56s (remain 105m 46s) Loss: 0.0231(0.0587) Grad: 0.2945  
Epoch: [1][39360/51233] Data 0.318 (0.311) Elapsed 350m 6s (remain 105m 36s) Loss: 0.0098(0.0587) Grad: 0.1691  
Epoch: [1][39380/51233] Data 0.310 (0.311) Elapsed 350m 17s (remain 105m 25s) Loss: 0.0070(0.0587) Grad: 0.0769  
Epoch: [1][39400/51233] Data 0.308 (0.311) Elapsed 350m 28s (remain 105m 14s) Loss: 0.0050(0.0587) Grad: 0.0816  
Epoch: [1][39420/51233] Data 0.303 (0.311) Elapsed 350m 38s (remain 105m 3s) Loss: 0.0605(0.0587) Grad: 2.1017  
Epoch: [1][39440/51233] Data 0.319 (0.311) Elapsed 350m 49s (remain 104m 53s) Loss: 0.2962(0.0587) Grad: 0.8864  
Epoch: [1][39460/51233] Data 0.317 (0.311) Elapsed 351m 0s (remain 104m 42s) Loss: 0.0132(0.0587) Grad: 0.1135  
Epoch: [1][39480/51233] Data 0.313 (0.311) Elapsed 351m 10s (remain 104m 31s) Loss: 0.0007(0.0586) Grad: 0.0076  
Epoch: [1][39500/51233] Data 0.294 (0.311) Elapsed 351m 21s (remain 104m 21s) Loss: 0.0008(0.0586) Grad: 0.0069  
Epoch: [1][39520/51233] Data 0.313 (0.311) Elapsed 351m 32s (remain 104m 10s) Loss: 0.0908(0.0586) Grad: 1.2974  
Epoch: [1][39540/51233] Data 0.318 (0.311) Elapsed 351m 42s (remain 103m 59s) Loss: 0.1827(0.0586) Grad: 1.5080  
Epoch: [1][39560/51233] Data 0.308 (0.311) Elapsed 351m 53s (remain 103m 49s) Loss: 0.0083(0.0586) Grad: 0.1359  
Epoch: [1][39580/51233] Data 0.308 (0.311) Elapsed 352m 4s (remain 103m 38s) Loss: 0.0119(0.0586) Grad: 0.3541  
Epoch: [1][39600/51233] Data 0.305 (0.311) Elapsed 352m 14s (remain 103m 27s) Loss: 0.1033(0.0586) Grad: 1.0767  
Epoch: [1][39620/51233] Data 0.312 (0.311) Elapsed 352m 25s (remain 103m 17s) Loss: 0.0091(0.0586) Grad: 0.1863  
Epoch: [1][39640/51233] Data 0.315 (0.311) Elapsed 352m 36s (remain 103m 6s) Loss: 0.0345(0.0586) Grad: 0.5983  
Epoch: [1][39660/51233] Data 0.317 (0.311) Elapsed 352m 47s (remain 102m 55s) Loss: 0.0840(0.0586) Grad: 1.8463  
Epoch: [1][39680/51233] Data 0.308 (0.311) Elapsed 352m 57s (remain 102m 45s) Loss: 0.0544(0.0586) Grad: 0.5919  
Epoch: [1][39700/51233] Data 0.300 (0.311) Elapsed 353m 8s (remain 102m 34s) Loss: 0.0339(0.0586) Grad: 0.5628  
Epoch: [1][39720/51233] Data 0.310 (0.311) Elapsed 353m 19s (remain 102m 23s) Loss: 0.1211(0.0586) Grad: 1.2220  
Epoch: [1][39740/51233] Data 0.305 (0.311) Elapsed 353m 29s (remain 102m 13s) Loss: 0.0833(0.0586) Grad: 0.8552  
Epoch: [1][39760/51233] Data 0.314 (0.311) Elapsed 353m 40s (remain 102m 2s) Loss: 0.0888(0.0586) Grad: 1.4406  
Epoch: [1][39780/51233] Data 0.315 (0.310) Elapsed 353m 51s (remain 101m 51s) Loss: 0.2356(0.0586) Grad: 2.5380  
Epoch: [1][39800/51233] Data 0.283 (0.310) Elapsed 354m 1s (remain 101m 41s) Loss: 0.0766(0.0586) Grad: 0.6235  
Epoch: [1][39820/51233] Data 0.309 (0.310) Elapsed 354m 12s (remain 101m 30s) Loss: 0.0199(0.0586) Grad: 0.1864  
Epoch: [1][39840/51233] Data 0.310 (0.310) Elapsed 354m 23s (remain 101m 19s) Loss: 0.0166(0.0586) Grad: 0.4214  
Epoch: [1][39860/51233] Data 0.317 (0.310) Elapsed 354m 33s (remain 101m 9s) Loss: 0.0384(0.0586) Grad: 0.9251  
Epoch: [1][39880/51233] Data 0.297 (0.310) Elapsed 354m 44s (remain 100m 58s) Loss: 0.0035(0.0586) Grad: 0.0397  
Epoch: [1][39900/51233] Data 0.318 (0.310) Elapsed 354m 55s (remain 100m 47s) Loss: 0.0154(0.0586) Grad: 0.2314  
Epoch: [1][39920/51233] Data 0.317 (0.310) Elapsed 355m 5s (remain 100m 37s) Loss: 0.0024(0.0586) Grad: 0.0337  
Epoch: [1][39940/51233] Data 0.310 (0.310) Elapsed 355m 16s (remain 100m 26s) Loss: 0.0068(0.0586) Grad: 0.1022  
Epoch: [1][39960/51233] Data 0.315 (0.310) Elapsed 355m 27s (remain 100m 15s) Loss: 0.0108(0.0586) Grad: 0.2336  
Epoch: [1][39980/51233] Data 0.308 (0.310) Elapsed 355m 38s (remain 100m 5s) Loss: 0.0092(0.0586) Grad: 0.1355  
Epoch: [1][40000/51233] Data 0.318 (0.310) Elapsed 355m 48s (remain 99m 54s) Loss: 0.0236(0.0585) Grad: 0.2322  
Epoch: [1][40020/51233] Data 0.305 (0.310) Elapsed 355m 59s (remain 99m 43s) Loss: 0.0014(0.0585) Grad: 0.0165  
Epoch: [1][40040/51233] Data 0.314 (0.310) Elapsed 356m 10s (remain 99m 33s) Loss: 0.1553(0.0585) Grad: 2.0972  
Epoch: [1][40060/51233] Data 0.318 (0.310) Elapsed 356m 20s (remain 99m 22s) Loss: 0.0089(0.0585) Grad: 0.0980  
Epoch: [1][40080/51233] Data 0.302 (0.310) Elapsed 356m 31s (remain 99m 11s) Loss: 0.0133(0.0585) Grad: 0.3305  
Epoch: [1][40100/51233] Data 0.300 (0.310) Elapsed 356m 42s (remain 99m 1s) Loss: 0.0958(0.0585) Grad: 1.3902  
Epoch: [1][40120/51233] Data 0.309 (0.310) Elapsed 356m 52s (remain 98m 50s) Loss: 0.0039(0.0585) Grad: 0.0496  
Epoch: [1][40140/51233] Data 0.317 (0.310) Elapsed 357m 3s (remain 98m 39s) Loss: 0.0877(0.0585) Grad: 0.6524  
Epoch: [1][40160/51233] Data 0.318 (0.310) Elapsed 357m 14s (remain 98m 29s) Loss: 0.0917(0.0585) Grad: 0.6840  
Epoch: [1][40180/51233] Data 0.317 (0.310) Elapsed 357m 24s (remain 98m 18s) Loss: 0.0041(0.0585) Grad: 0.1208  
Epoch: [1][40200/51233] Data 0.318 (0.310) Elapsed 357m 35s (remain 98m 7s) Loss: 0.0022(0.0585) Grad: 0.0495  
Epoch: [1][40220/51233] Data 0.314 (0.310) Elapsed 357m 46s (remain 97m 57s) Loss: 0.0541(0.0585) Grad: 0.3378  
Epoch: [1][40240/51233] Data 0.296 (0.310) Elapsed 357m 56s (remain 97m 46s) Loss: 0.0021(0.0585) Grad: 0.0386  
Epoch: [1][40260/51233] Data 0.308 (0.310) Elapsed 358m 7s (remain 97m 35s) Loss: 0.0220(0.0585) Grad: 0.3211  
Epoch: [1][40280/51233] Data 0.318 (0.310) Elapsed 358m 18s (remain 97m 25s) Loss: 0.1561(0.0585) Grad: 0.9990  
Epoch: [1][40300/51233] Data 0.318 (0.310) Elapsed 358m 29s (remain 97m 14s) Loss: 0.0410(0.0585) Grad: 1.0925  
Epoch: [1][40320/51233] Data 0.307 (0.310) Elapsed 358m 39s (remain 97m 3s) Loss: 0.2340(0.0585) Grad: 1.7276  
Epoch: [1][40340/51233] Data 0.292 (0.310) Elapsed 358m 50s (remain 96m 53s) Loss: 0.0235(0.0585) Grad: 0.3385  
Epoch: [1][40360/51233] Data 0.300 (0.310) Elapsed 359m 1s (remain 96m 42s) Loss: 0.0667(0.0585) Grad: 1.1741  
Epoch: [1][40380/51233] Data 0.318 (0.310) Elapsed 359m 11s (remain 96m 31s) Loss: 0.1322(0.0584) Grad: 1.8051  
Epoch: [1][40400/51233] Data 0.317 (0.310) Elapsed 359m 22s (remain 96m 21s) Loss: 0.0275(0.0584) Grad: 0.3193  
Epoch: [1][40420/51233] Data 0.303 (0.310) Elapsed 359m 33s (remain 96m 10s) Loss: 0.0746(0.0584) Grad: 0.9972  
Epoch: [1][40440/51233] Data 0.318 (0.310) Elapsed 359m 43s (remain 95m 59s) Loss: 0.0056(0.0584) Grad: 0.1125  
Epoch: [1][40460/51233] Data 0.317 (0.310) Elapsed 359m 54s (remain 95m 49s) Loss: 0.0020(0.0584) Grad: 0.0225  
Epoch: [1][40480/51233] Data 0.301 (0.310) Elapsed 360m 5s (remain 95m 38s) Loss: 0.0013(0.0584) Grad: 0.0168  
Epoch: [1][40500/51233] Data 0.318 (0.310) Elapsed 360m 15s (remain 95m 27s) Loss: 0.0623(0.0584) Grad: 0.6840  
Epoch: [1][40520/51233] Data 0.308 (0.310) Elapsed 360m 26s (remain 95m 17s) Loss: 0.0143(0.0584) Grad: 0.5396  
Epoch: [1][40540/51233] Data 0.317 (0.310) Elapsed 360m 37s (remain 95m 6s) Loss: 0.0681(0.0584) Grad: 0.8239  
Epoch: [1][40560/51233] Data 0.314 (0.310) Elapsed 360m 47s (remain 94m 55s) Loss: 0.0155(0.0584) Grad: 0.4075  
Epoch: [1][40580/51233] Data 0.316 (0.310) Elapsed 360m 58s (remain 94m 45s) Loss: 0.0051(0.0584) Grad: 0.0736  
Epoch: [1][40600/51233] Data 0.307 (0.310) Elapsed 361m 9s (remain 94m 34s) Loss: 0.0053(0.0584) Grad: 0.0671  
Epoch: [1][40620/51233] Data 0.314 (0.310) Elapsed 361m 19s (remain 94m 23s) Loss: 0.1484(0.0584) Grad: 2.1557  
Epoch: [1][40640/51233] Data 0.309 (0.310) Elapsed 361m 30s (remain 94m 13s) Loss: 0.0634(0.0584) Grad: 1.3896  
Epoch: [1][40660/51233] Data 0.302 (0.310) Elapsed 361m 41s (remain 94m 2s) Loss: 0.0010(0.0584) Grad: 0.0107  
Epoch: [1][40680/51233] Data 0.297 (0.310) Elapsed 361m 52s (remain 93m 51s) Loss: 0.0050(0.0584) Grad: 0.0898  
Epoch: [1][40700/51233] Data 0.309 (0.310) Elapsed 362m 2s (remain 93m 41s) Loss: 0.0424(0.0584) Grad: 0.6319  
Epoch: [1][40720/51233] Data 0.318 (0.310) Elapsed 362m 13s (remain 93m 30s) Loss: 0.0093(0.0584) Grad: 0.1598  
Epoch: [1][40740/51233] Data 0.310 (0.310) Elapsed 362m 24s (remain 93m 19s) Loss: 0.0042(0.0584) Grad: 0.0844  
Epoch: [1][40760/51233] Data 0.306 (0.310) Elapsed 362m 34s (remain 93m 9s) Loss: 0.1026(0.0584) Grad: 1.4379  
Epoch: [1][40780/51233] Data 0.317 (0.310) Elapsed 362m 45s (remain 92m 58s) Loss: 0.0631(0.0584) Grad: 0.9116  
Epoch: [1][40800/51233] Data 0.303 (0.310) Elapsed 362m 56s (remain 92m 47s) Loss: 0.0674(0.0584) Grad: 1.2123  
Epoch: [1][40820/51233] Data 0.308 (0.310) Elapsed 363m 6s (remain 92m 37s) Loss: 0.0560(0.0583) Grad: 0.9937  
Epoch: [1][40840/51233] Data 0.308 (0.310) Elapsed 363m 17s (remain 92m 26s) Loss: 0.0603(0.0583) Grad: 0.7186  
Epoch: [1][40860/51233] Data 0.299 (0.310) Elapsed 363m 28s (remain 92m 15s) Loss: 0.0169(0.0583) Grad: 0.5203  
Epoch: [1][40880/51233] Data 0.306 (0.310) Elapsed 363m 38s (remain 92m 5s) Loss: 0.1614(0.0583) Grad: 1.1926  
Epoch: [1][40900/51233] Data 0.302 (0.310) Elapsed 363m 49s (remain 91m 54s) Loss: 0.0731(0.0583) Grad: 0.7158  
Epoch: [1][40920/51233] Data 0.299 (0.310) Elapsed 364m 0s (remain 91m 43s) Loss: 0.3157(0.0583) Grad: 2.3560  
Epoch: [1][40940/51233] Data 0.308 (0.310) Elapsed 364m 10s (remain 91m 33s) Loss: 0.1243(0.0583) Grad: 1.2983  
Epoch: [1][40960/51233] Data 0.308 (0.310) Elapsed 364m 21s (remain 91m 22s) Loss: 0.1438(0.0583) Grad: 1.6226  
Epoch: [1][40980/51233] Data 0.311 (0.310) Elapsed 364m 32s (remain 91m 11s) Loss: 0.0572(0.0583) Grad: 1.2211  
Epoch: [1][41000/51233] Data 0.318 (0.310) Elapsed 364m 43s (remain 91m 1s) Loss: 0.0018(0.0583) Grad: 0.0280  
Epoch: [1][41020/51233] Data 0.318 (0.310) Elapsed 364m 53s (remain 90m 50s) Loss: 0.0164(0.0583) Grad: 0.3465  
Epoch: [1][41040/51233] Data 0.318 (0.310) Elapsed 365m 4s (remain 90m 39s) Loss: 0.1267(0.0583) Grad: 0.9002  
Epoch: [1][41060/51233] Data 0.317 (0.310) Elapsed 365m 15s (remain 90m 29s) Loss: 0.0338(0.0583) Grad: 0.3137  
Epoch: [1][41080/51233] Data 0.305 (0.310) Elapsed 365m 25s (remain 90m 18s) Loss: 0.2505(0.0583) Grad: 1.5175  
Epoch: [1][41100/51233] Data 0.310 (0.310) Elapsed 365m 36s (remain 90m 7s) Loss: 0.0014(0.0583) Grad: 0.0178  
Epoch: [1][41120/51233] Data 0.317 (0.310) Elapsed 365m 47s (remain 89m 56s) Loss: 0.0138(0.0583) Grad: 0.2896  
Epoch: [1][41140/51233] Data 0.305 (0.310) Elapsed 365m 57s (remain 89m 46s) Loss: 0.0089(0.0583) Grad: 0.1400  
Epoch: [1][41160/51233] Data 0.317 (0.310) Elapsed 366m 8s (remain 89m 35s) Loss: 0.0105(0.0583) Grad: 0.2796  
Epoch: [1][41180/51233] Data 0.305 (0.310) Elapsed 366m 19s (remain 89m 24s) Loss: 0.0115(0.0583) Grad: 0.3603  
Epoch: [1][41200/51233] Data 0.314 (0.310) Elapsed 366m 29s (remain 89m 14s) Loss: 0.0076(0.0583) Grad: 0.1482  
Epoch: [1][41220/51233] Data 0.304 (0.310) Elapsed 366m 40s (remain 89m 3s) Loss: 0.0329(0.0583) Grad: 0.5001  
Epoch: [1][41240/51233] Data 0.295 (0.310) Elapsed 366m 51s (remain 88m 52s) Loss: 0.0297(0.0583) Grad: 0.4278  
Epoch: [1][41260/51233] Data 0.308 (0.310) Elapsed 367m 1s (remain 88m 42s) Loss: 0.0306(0.0583) Grad: 0.5667  
Epoch: [1][41280/51233] Data 0.299 (0.310) Elapsed 367m 12s (remain 88m 31s) Loss: 0.0026(0.0583) Grad: 0.0345  
Epoch: [1][41300/51233] Data 0.308 (0.310) Elapsed 367m 23s (remain 88m 20s) Loss: 0.0025(0.0583) Grad: 0.0348  
Epoch: [1][41320/51233] Data 0.313 (0.310) Elapsed 367m 34s (remain 88m 10s) Loss: 0.0117(0.0583) Grad: 0.2506  
Epoch: [1][41340/51233] Data 0.300 (0.310) Elapsed 367m 44s (remain 87m 59s) Loss: 0.0315(0.0582) Grad: 1.1952  
Epoch: [1][41360/51233] Data 0.309 (0.310) Elapsed 367m 55s (remain 87m 48s) Loss: 0.0375(0.0582) Grad: 0.6037  
Epoch: [1][41380/51233] Data 0.317 (0.310) Elapsed 368m 6s (remain 87m 38s) Loss: 0.0018(0.0582) Grad: 0.0239  
Epoch: [1][41400/51233] Data 0.307 (0.310) Elapsed 368m 16s (remain 87m 27s) Loss: 0.0057(0.0582) Grad: 0.0625  
Epoch: [1][41420/51233] Data 0.311 (0.310) Elapsed 368m 27s (remain 87m 16s) Loss: 0.0074(0.0582) Grad: 0.1459  
Epoch: [1][41440/51233] Data 0.308 (0.310) Elapsed 368m 38s (remain 87m 6s) Loss: 0.0496(0.0582) Grad: 0.8179  
Epoch: [1][41460/51233] Data 0.309 (0.310) Elapsed 368m 48s (remain 86m 55s) Loss: 0.0012(0.0582) Grad: 0.0161  
Epoch: [1][41480/51233] Data 0.317 (0.310) Elapsed 368m 59s (remain 86m 44s) Loss: 0.0171(0.0582) Grad: 0.3953  
Epoch: [1][41500/51233] Data 0.316 (0.310) Elapsed 369m 10s (remain 86m 34s) Loss: 0.0018(0.0582) Grad: 0.0220  
Epoch: [1][41520/51233] Data 0.301 (0.310) Elapsed 369m 20s (remain 86m 23s) Loss: 0.2330(0.0582) Grad: 1.9782  
Epoch: [1][41540/51233] Data 0.316 (0.310) Elapsed 369m 31s (remain 86m 12s) Loss: 0.0448(0.0582) Grad: 1.0108  
Epoch: [1][41560/51233] Data 0.298 (0.310) Elapsed 369m 42s (remain 86m 2s) Loss: 0.0006(0.0582) Grad: 0.0120  
Epoch: [1][41580/51233] Data 0.318 (0.310) Elapsed 369m 52s (remain 85m 51s) Loss: 0.0008(0.0582) Grad: 0.0091  
Epoch: [1][41600/51233] Data 0.315 (0.310) Elapsed 370m 3s (remain 85m 40s) Loss: 0.1535(0.0582) Grad: 1.3704  
Epoch: [1][41620/51233] Data 0.318 (0.310) Elapsed 370m 14s (remain 85m 30s) Loss: 0.5363(0.0582) Grad: 2.3231  
Epoch: [1][41640/51233] Data 0.314 (0.310) Elapsed 370m 25s (remain 85m 19s) Loss: 0.1176(0.0582) Grad: 0.9332  
Epoch: [1][41660/51233] Data 0.315 (0.310) Elapsed 370m 35s (remain 85m 8s) Loss: 0.0074(0.0581) Grad: 0.1527  
Epoch: [1][41680/51233] Data 0.306 (0.310) Elapsed 370m 46s (remain 84m 58s) Loss: 0.0157(0.0581) Grad: 0.2910  
Epoch: [1][41700/51233] Data 0.317 (0.310) Elapsed 370m 57s (remain 84m 47s) Loss: 0.0106(0.0581) Grad: 0.2021  
Epoch: [1][41720/51233] Data 0.307 (0.310) Elapsed 371m 7s (remain 84m 36s) Loss: 0.0074(0.0581) Grad: 0.1426  
Epoch: [1][41740/51233] Data 0.309 (0.310) Elapsed 371m 18s (remain 84m 26s) Loss: 0.1198(0.0581) Grad: 1.0019  
Epoch: [1][41760/51233] Data 0.306 (0.310) Elapsed 371m 29s (remain 84m 15s) Loss: 0.0039(0.0581) Grad: 0.0693  
Epoch: [1][41780/51233] Data 0.317 (0.310) Elapsed 371m 39s (remain 84m 4s) Loss: 0.3229(0.0581) Grad: 2.5108  
Epoch: [1][41800/51233] Data 0.309 (0.310) Elapsed 371m 50s (remain 83m 54s) Loss: 0.0230(0.0581) Grad: 0.3392  
Epoch: [1][41820/51233] Data 0.307 (0.310) Elapsed 372m 1s (remain 83m 43s) Loss: 0.0048(0.0581) Grad: 0.0528  
Epoch: [1][41840/51233] Data 0.301 (0.310) Elapsed 372m 11s (remain 83m 32s) Loss: 0.1020(0.0581) Grad: 1.0329  
Epoch: [1][41860/51233] Data 0.317 (0.310) Elapsed 372m 22s (remain 83m 22s) Loss: 0.1232(0.0581) Grad: 1.6647  
Epoch: [1][41880/51233] Data 0.317 (0.310) Elapsed 372m 33s (remain 83m 11s) Loss: 0.0059(0.0581) Grad: 0.1242  
Epoch: [1][41900/51233] Data 0.309 (0.310) Elapsed 372m 43s (remain 83m 0s) Loss: 0.0373(0.0581) Grad: 0.4820  
Epoch: [1][41920/51233] Data 0.318 (0.310) Elapsed 372m 54s (remain 82m 50s) Loss: 0.3109(0.0581) Grad: 2.8153  
Epoch: [1][41940/51233] Data 0.308 (0.310) Elapsed 373m 5s (remain 82m 39s) Loss: 0.0048(0.0581) Grad: 0.0632  
Epoch: [1][41960/51233] Data 0.311 (0.310) Elapsed 373m 15s (remain 82m 28s) Loss: 0.2203(0.0581) Grad: 2.9789  
Epoch: [1][41980/51233] Data 0.317 (0.310) Elapsed 373m 26s (remain 82m 18s) Loss: 0.0098(0.0581) Grad: 0.1043  
Epoch: [1][42000/51233] Data 0.312 (0.310) Elapsed 373m 37s (remain 82m 7s) Loss: 0.0076(0.0581) Grad: 0.1232  
Epoch: [1][42020/51233] Data 0.318 (0.310) Elapsed 373m 47s (remain 81m 56s) Loss: 0.0613(0.0581) Grad: 0.9956  
Epoch: [1][42040/51233] Data 0.305 (0.310) Elapsed 373m 58s (remain 81m 46s) Loss: 0.0188(0.0581) Grad: 0.2212  
Epoch: [1][42060/51233] Data 0.317 (0.310) Elapsed 374m 9s (remain 81m 35s) Loss: 0.0465(0.0581) Grad: 1.3459  
Epoch: [1][42080/51233] Data 0.309 (0.310) Elapsed 374m 19s (remain 81m 24s) Loss: 0.0403(0.0581) Grad: 0.4575  
Epoch: [1][42100/51233] Data 0.308 (0.310) Elapsed 374m 30s (remain 81m 13s) Loss: 0.0363(0.0581) Grad: 0.6593  
Epoch: [1][42120/51233] Data 0.309 (0.310) Elapsed 374m 41s (remain 81m 3s) Loss: 0.0070(0.0581) Grad: 0.1290  
Epoch: [1][42140/51233] Data 0.271 (0.310) Elapsed 374m 51s (remain 80m 52s) Loss: 0.0268(0.0581) Grad: 0.6157  
Epoch: [1][42160/51233] Data 0.307 (0.310) Elapsed 375m 2s (remain 80m 41s) Loss: 0.0537(0.0581) Grad: 1.0956  
Epoch: [1][42180/51233] Data 0.317 (0.310) Elapsed 375m 13s (remain 80m 31s) Loss: 0.1251(0.0580) Grad: 1.3328  
Epoch: [1][42200/51233] Data 0.305 (0.310) Elapsed 375m 23s (remain 80m 20s) Loss: 0.0009(0.0580) Grad: 0.0184  
Epoch: [1][42220/51233] Data 0.308 (0.310) Elapsed 375m 34s (remain 80m 9s) Loss: 0.0235(0.0580) Grad: 0.3052  
Epoch: [1][42240/51233] Data 0.306 (0.310) Elapsed 375m 45s (remain 79m 59s) Loss: 0.0066(0.0580) Grad: 0.1473  
Epoch: [1][42260/51233] Data 0.293 (0.310) Elapsed 375m 55s (remain 79m 48s) Loss: 0.0124(0.0580) Grad: 0.3276  
Epoch: [1][42280/51233] Data 0.315 (0.310) Elapsed 376m 6s (remain 79m 37s) Loss: 0.0788(0.0580) Grad: 1.4740  
Epoch: [1][42300/51233] Data 0.315 (0.310) Elapsed 376m 17s (remain 79m 27s) Loss: 0.3630(0.0580) Grad: 1.5209  
Epoch: [1][42320/51233] Data 0.310 (0.310) Elapsed 376m 27s (remain 79m 16s) Loss: 0.3283(0.0580) Grad: 1.9688  
Epoch: [1][42340/51233] Data 0.312 (0.310) Elapsed 376m 38s (remain 79m 5s) Loss: 0.0123(0.0580) Grad: 0.2301  
Epoch: [1][42360/51233] Data 0.308 (0.310) Elapsed 376m 49s (remain 78m 55s) Loss: 0.0087(0.0580) Grad: 0.1186  
Epoch: [1][42380/51233] Data 0.310 (0.310) Elapsed 377m 0s (remain 78m 44s) Loss: 0.0065(0.0580) Grad: 0.1205  
Epoch: [1][42400/51233] Data 0.318 (0.310) Elapsed 377m 10s (remain 78m 33s) Loss: 0.0145(0.0580) Grad: 0.3363  
Epoch: [1][42420/51233] Data 0.317 (0.310) Elapsed 377m 21s (remain 78m 23s) Loss: 0.0622(0.0580) Grad: 1.0522  
Epoch: [1][42440/51233] Data 0.309 (0.310) Elapsed 377m 32s (remain 78m 12s) Loss: 0.2573(0.0580) Grad: 1.8380  
Epoch: [1][42460/51233] Data 0.305 (0.310) Elapsed 377m 42s (remain 78m 1s) Loss: 0.1196(0.0580) Grad: 2.1808  
Epoch: [1][42480/51233] Data 0.299 (0.310) Elapsed 377m 53s (remain 77m 51s) Loss: 0.0042(0.0580) Grad: 0.0676  
Epoch: [1][42500/51233] Data 0.317 (0.310) Elapsed 378m 4s (remain 77m 40s) Loss: 0.0027(0.0580) Grad: 0.1022  
Epoch: [1][42520/51233] Data 0.318 (0.310) Elapsed 378m 14s (remain 77m 29s) Loss: 0.0131(0.0580) Grad: 0.2453  
Epoch: [1][42540/51233] Data 0.311 (0.310) Elapsed 378m 25s (remain 77m 19s) Loss: 0.0673(0.0580) Grad: 1.5015  
Epoch: [1][42560/51233] Data 0.308 (0.310) Elapsed 378m 36s (remain 77m 8s) Loss: 0.0622(0.0580) Grad: 0.6749  
Epoch: [1][42580/51233] Data 0.318 (0.310) Elapsed 378m 46s (remain 76m 57s) Loss: 0.0420(0.0580) Grad: 0.6598  
Epoch: [1][42600/51233] Data 0.308 (0.310) Elapsed 378m 57s (remain 76m 47s) Loss: 0.0205(0.0580) Grad: 0.2665  
Epoch: [1][42620/51233] Data 0.314 (0.310) Elapsed 379m 8s (remain 76m 36s) Loss: 0.2067(0.0580) Grad: 1.3612  
Epoch: [1][42640/51233] Data 0.314 (0.310) Elapsed 379m 18s (remain 76m 25s) Loss: 0.0491(0.0580) Grad: 1.2710  
Epoch: [1][42660/51233] Data 0.309 (0.310) Elapsed 379m 29s (remain 76m 15s) Loss: 0.0003(0.0580) Grad: 0.0053  
Epoch: [1][42680/51233] Data 0.307 (0.310) Elapsed 379m 40s (remain 76m 4s) Loss: 0.0922(0.0580) Grad: 1.5444  
Epoch: [1][42700/51233] Data 0.317 (0.310) Elapsed 379m 51s (remain 75m 53s) Loss: 0.0027(0.0580) Grad: 0.0660  
Epoch: [1][42720/51233] Data 0.318 (0.310) Elapsed 380m 1s (remain 75m 43s) Loss: 0.1230(0.0580) Grad: 1.8149  
Epoch: [1][42740/51233] Data 0.317 (0.310) Elapsed 380m 12s (remain 75m 32s) Loss: 0.0388(0.0580) Grad: 0.5876  
Epoch: [1][42760/51233] Data 0.306 (0.310) Elapsed 380m 23s (remain 75m 21s) Loss: 0.0138(0.0580) Grad: 0.3103  
Epoch: [1][42780/51233] Data 0.317 (0.310) Elapsed 380m 33s (remain 75m 11s) Loss: 0.0150(0.0579) Grad: 0.2134  
Epoch: [1][42800/51233] Data 0.309 (0.310) Elapsed 380m 44s (remain 75m 0s) Loss: 0.0976(0.0579) Grad: 1.3956  
Epoch: [1][42820/51233] Data 0.310 (0.310) Elapsed 380m 55s (remain 74m 49s) Loss: 0.0155(0.0579) Grad: 0.3244  
Epoch: [1][42840/51233] Data 0.309 (0.310) Elapsed 381m 5s (remain 74m 39s) Loss: 0.0025(0.0579) Grad: 0.0388  
Epoch: [1][42860/51233] Data 0.305 (0.310) Elapsed 381m 16s (remain 74m 28s) Loss: 0.0008(0.0579) Grad: 0.0100  
Epoch: [1][42880/51233] Data 0.309 (0.310) Elapsed 381m 27s (remain 74m 17s) Loss: 0.2214(0.0579) Grad: 1.9772  
Epoch: [1][42900/51233] Data 0.303 (0.310) Elapsed 381m 37s (remain 74m 7s) Loss: 0.2674(0.0579) Grad: 2.0537  
Epoch: [1][42920/51233] Data 0.305 (0.310) Elapsed 381m 48s (remain 73m 56s) Loss: 0.1116(0.0579) Grad: 2.0381  
Epoch: [1][42940/51233] Data 0.317 (0.310) Elapsed 381m 59s (remain 73m 45s) Loss: 0.0122(0.0579) Grad: 0.1952  
Epoch: [1][42960/51233] Data 0.315 (0.310) Elapsed 382m 9s (remain 73m 35s) Loss: 0.0315(0.0579) Grad: 0.5890  
Epoch: [1][42980/51233] Data 0.314 (0.310) Elapsed 382m 20s (remain 73m 24s) Loss: 0.0775(0.0579) Grad: 0.8617  
Epoch: [1][43000/51233] Data 0.306 (0.310) Elapsed 382m 31s (remain 73m 13s) Loss: 0.0247(0.0579) Grad: 0.5223  
Epoch: [1][43020/51233] Data 0.306 (0.310) Elapsed 382m 42s (remain 73m 3s) Loss: 0.0592(0.0579) Grad: 1.1559  
Epoch: [1][43040/51233] Data 0.307 (0.310) Elapsed 382m 52s (remain 72m 52s) Loss: 0.2976(0.0579) Grad: 1.8305  
Epoch: [1][43060/51233] Data 0.300 (0.310) Elapsed 383m 3s (remain 72m 41s) Loss: 0.0041(0.0579) Grad: 0.0385  
Epoch: [1][43080/51233] Data 0.309 (0.310) Elapsed 383m 14s (remain 72m 31s) Loss: 0.0038(0.0579) Grad: 0.0445  
Epoch: [1][43100/51233] Data 0.308 (0.310) Elapsed 383m 24s (remain 72m 20s) Loss: 0.0013(0.0579) Grad: 0.0204  
Epoch: [1][43120/51233] Data 0.308 (0.310) Elapsed 383m 35s (remain 72m 9s) Loss: 0.1657(0.0579) Grad: 1.3955  
Epoch: [1][43140/51233] Data 0.317 (0.310) Elapsed 383m 46s (remain 71m 59s) Loss: 0.1866(0.0579) Grad: 2.2850  
Epoch: [1][43160/51233] Data 0.314 (0.310) Elapsed 383m 56s (remain 71m 48s) Loss: 0.1349(0.0579) Grad: 1.5318  
Epoch: [1][43180/51233] Data 0.307 (0.310) Elapsed 384m 7s (remain 71m 37s) Loss: 0.0100(0.0579) Grad: 0.1643  
Epoch: [1][43200/51233] Data 0.317 (0.310) Elapsed 384m 18s (remain 71m 27s) Loss: 0.0042(0.0579) Grad: 0.0882  
Epoch: [1][43220/51233] Data 0.302 (0.310) Elapsed 384m 28s (remain 71m 16s) Loss: 0.0050(0.0579) Grad: 0.0713  
Epoch: [1][43240/51233] Data 0.309 (0.310) Elapsed 384m 39s (remain 71m 5s) Loss: 0.0117(0.0579) Grad: 0.1180  
Epoch: [1][43260/51233] Data 0.318 (0.310) Elapsed 384m 50s (remain 70m 55s) Loss: 0.0098(0.0579) Grad: 0.1813  
Epoch: [1][43280/51233] Data 0.296 (0.310) Elapsed 385m 0s (remain 70m 44s) Loss: 0.0060(0.0579) Grad: 0.0575  
Epoch: [1][43300/51233] Data 0.306 (0.310) Elapsed 385m 11s (remain 70m 33s) Loss: 0.0145(0.0579) Grad: 0.2657  
Epoch: [1][43320/51233] Data 0.308 (0.310) Elapsed 385m 22s (remain 70m 22s) Loss: 0.1930(0.0579) Grad: 1.1386  
Epoch: [1][43340/51233] Data 0.315 (0.310) Elapsed 385m 33s (remain 70m 12s) Loss: 0.0240(0.0579) Grad: 0.3668  
Epoch: [1][43360/51233] Data 0.307 (0.310) Elapsed 385m 43s (remain 70m 1s) Loss: 0.0659(0.0579) Grad: 1.2207  
Epoch: [1][43380/51233] Data 0.308 (0.310) Elapsed 385m 54s (remain 69m 50s) Loss: 0.0598(0.0579) Grad: 1.0092  
Epoch: [1][43400/51233] Data 0.312 (0.310) Elapsed 386m 5s (remain 69m 40s) Loss: 0.0069(0.0579) Grad: 0.0908  
Epoch: [1][43420/51233] Data 0.318 (0.310) Elapsed 386m 15s (remain 69m 29s) Loss: 0.0043(0.0579) Grad: 0.0552  
Epoch: [1][43440/51233] Data 0.318 (0.310) Elapsed 386m 26s (remain 69m 18s) Loss: 0.0769(0.0579) Grad: 0.7825  
Epoch: [1][43460/51233] Data 0.317 (0.310) Elapsed 386m 37s (remain 69m 8s) Loss: 0.0017(0.0579) Grad: 0.0393  
Epoch: [1][43480/51233] Data 0.309 (0.310) Elapsed 386m 47s (remain 68m 57s) Loss: 0.0034(0.0579) Grad: 0.0732  
Epoch: [1][43500/51233] Data 0.311 (0.310) Elapsed 386m 58s (remain 68m 46s) Loss: 0.0089(0.0579) Grad: 0.1294  
Epoch: [1][43520/51233] Data 0.314 (0.310) Elapsed 387m 9s (remain 68m 36s) Loss: 0.1224(0.0579) Grad: 2.0754  
Epoch: [1][43540/51233] Data 0.309 (0.310) Elapsed 387m 19s (remain 68m 25s) Loss: 0.0315(0.0579) Grad: 0.4680  
Epoch: [1][43560/51233] Data 0.310 (0.310) Elapsed 387m 30s (remain 68m 14s) Loss: 0.0015(0.0579) Grad: 0.0356  
Epoch: [1][43580/51233] Data 0.312 (0.310) Elapsed 387m 41s (remain 68m 4s) Loss: 0.0847(0.0579) Grad: 1.3871  
Epoch: [1][43600/51233] Data 0.310 (0.310) Elapsed 387m 51s (remain 67m 53s) Loss: 0.0135(0.0579) Grad: 0.3316  
Epoch: [1][43620/51233] Data 0.318 (0.310) Elapsed 388m 2s (remain 67m 42s) Loss: 0.0233(0.0579) Grad: 0.3995  
Epoch: [1][43640/51233] Data 0.308 (0.310) Elapsed 388m 13s (remain 67m 32s) Loss: 0.0067(0.0579) Grad: 0.0929  
Epoch: [1][43660/51233] Data 0.313 (0.310) Elapsed 388m 24s (remain 67m 21s) Loss: 0.1916(0.0579) Grad: 1.7648  
Epoch: [1][43680/51233] Data 0.305 (0.310) Elapsed 388m 34s (remain 67m 10s) Loss: 0.0264(0.0579) Grad: 0.4710  
Epoch: [1][43700/51233] Data 0.308 (0.310) Elapsed 388m 45s (remain 67m 0s) Loss: 0.0646(0.0579) Grad: 1.4096  
Epoch: [1][43720/51233] Data 0.318 (0.310) Elapsed 388m 56s (remain 66m 49s) Loss: 0.1033(0.0579) Grad: 1.0701  
Epoch: [1][43740/51233] Data 0.298 (0.310) Elapsed 389m 6s (remain 66m 38s) Loss: 0.0066(0.0579) Grad: 0.0657  
Epoch: [1][43760/51233] Data 0.317 (0.310) Elapsed 389m 17s (remain 66m 28s) Loss: 0.1900(0.0579) Grad: 1.3837  
Epoch: [1][43780/51233] Data 0.311 (0.310) Elapsed 389m 28s (remain 66m 17s) Loss: 0.0402(0.0579) Grad: 0.5665  
Epoch: [1][43800/51233] Data 0.314 (0.310) Elapsed 389m 38s (remain 66m 6s) Loss: 0.0026(0.0579) Grad: 0.0450  
Epoch: [1][43820/51233] Data 0.308 (0.310) Elapsed 389m 49s (remain 65m 56s) Loss: 0.0057(0.0579) Grad: 0.0896  
Epoch: [1][43840/51233] Data 0.306 (0.310) Elapsed 390m 0s (remain 65m 45s) Loss: 0.2190(0.0579) Grad: 1.9404  
Epoch: [1][43860/51233] Data 0.312 (0.310) Elapsed 390m 10s (remain 65m 34s) Loss: 0.0126(0.0579) Grad: 0.1578  
Epoch: [1][43880/51233] Data 0.305 (0.310) Elapsed 390m 21s (remain 65m 24s) Loss: 0.0574(0.0579) Grad: 0.7541  
Epoch: [1][43900/51233] Data 0.309 (0.310) Elapsed 390m 32s (remain 65m 13s) Loss: 0.0037(0.0579) Grad: 0.0670  
Epoch: [1][43920/51233] Data 0.317 (0.310) Elapsed 390m 42s (remain 65m 2s) Loss: 0.0009(0.0578) Grad: 0.0187  
Epoch: [1][43940/51233] Data 0.306 (0.310) Elapsed 390m 53s (remain 64m 52s) Loss: 0.0220(0.0579) Grad: 0.3369  
Epoch: [1][43960/51233] Data 0.307 (0.310) Elapsed 391m 4s (remain 64m 41s) Loss: 0.0043(0.0578) Grad: 0.0808  
Epoch: [1][43980/51233] Data 0.314 (0.310) Elapsed 391m 15s (remain 64m 30s) Loss: 0.0154(0.0578) Grad: 0.3238  
Epoch: [1][44000/51233] Data 0.317 (0.310) Elapsed 391m 25s (remain 64m 20s) Loss: 0.0080(0.0578) Grad: 0.1760  
Epoch: [1][44020/51233] Data 0.305 (0.310) Elapsed 391m 36s (remain 64m 9s) Loss: 0.0177(0.0578) Grad: 0.2110  
Epoch: [1][44040/51233] Data 0.318 (0.310) Elapsed 391m 47s (remain 63m 58s) Loss: 0.0300(0.0578) Grad: 0.4548  
Epoch: [1][44060/51233] Data 0.307 (0.310) Elapsed 391m 57s (remain 63m 48s) Loss: 0.0020(0.0578) Grad: 0.0275  
Epoch: [1][44080/51233] Data 0.304 (0.310) Elapsed 392m 8s (remain 63m 37s) Loss: 0.1345(0.0578) Grad: 2.2384  
Epoch: [1][44100/51233] Data 0.317 (0.310) Elapsed 392m 19s (remain 63m 26s) Loss: 0.0208(0.0578) Grad: 0.4817  
Epoch: [1][44120/51233] Data 0.298 (0.310) Elapsed 392m 29s (remain 63m 16s) Loss: 0.0580(0.0578) Grad: 0.4923  
Epoch: [1][44140/51233] Data 0.306 (0.310) Elapsed 392m 40s (remain 63m 5s) Loss: 0.1188(0.0578) Grad: 1.1318  
Epoch: [1][44160/51233] Data 0.305 (0.310) Elapsed 392m 51s (remain 62m 54s) Loss: 0.0144(0.0578) Grad: 0.2597  
Epoch: [1][44180/51233] Data 0.315 (0.310) Elapsed 393m 1s (remain 62m 44s) Loss: 0.0049(0.0578) Grad: 0.0900  
Epoch: [1][44200/51233] Data 0.316 (0.310) Elapsed 393m 12s (remain 62m 33s) Loss: 0.0021(0.0578) Grad: 0.0390  
Epoch: [1][44220/51233] Data 0.316 (0.310) Elapsed 393m 23s (remain 62m 22s) Loss: 0.0084(0.0578) Grad: 0.0956  
Epoch: [1][44240/51233] Data 0.311 (0.310) Elapsed 393m 33s (remain 62m 12s) Loss: 0.0151(0.0578) Grad: 0.1351  
Epoch: [1][44260/51233] Data 0.294 (0.310) Elapsed 393m 44s (remain 62m 1s) Loss: 0.0010(0.0578) Grad: 0.0153  
Epoch: [1][44280/51233] Data 0.308 (0.310) Elapsed 393m 55s (remain 61m 50s) Loss: 0.0029(0.0578) Grad: 0.0333  
Epoch: [1][44300/51233] Data 0.318 (0.310) Elapsed 394m 5s (remain 61m 40s) Loss: 0.0007(0.0578) Grad: 0.0077  
Epoch: [1][44320/51233] Data 0.308 (0.310) Elapsed 394m 16s (remain 61m 29s) Loss: 0.0080(0.0578) Grad: 0.1566  
Epoch: [1][44340/51233] Data 0.309 (0.310) Elapsed 394m 27s (remain 61m 18s) Loss: 0.0555(0.0578) Grad: 0.8425  
Epoch: [1][44360/51233] Data 0.317 (0.310) Elapsed 394m 38s (remain 61m 7s) Loss: 0.0139(0.0578) Grad: 0.1761  
Epoch: [1][44380/51233] Data 0.315 (0.310) Elapsed 394m 48s (remain 60m 57s) Loss: 0.0358(0.0578) Grad: 0.7956  
Epoch: [1][44400/51233] Data 0.315 (0.310) Elapsed 394m 59s (remain 60m 46s) Loss: 0.0018(0.0578) Grad: 0.0350  
Epoch: [1][44420/51233] Data 0.305 (0.310) Elapsed 395m 10s (remain 60m 35s) Loss: 0.0030(0.0578) Grad: 0.0769  
Epoch: [1][44440/51233] Data 0.312 (0.310) Elapsed 395m 20s (remain 60m 25s) Loss: 0.0369(0.0578) Grad: 0.5310  
Epoch: [1][44460/51233] Data 0.318 (0.310) Elapsed 395m 31s (remain 60m 14s) Loss: 0.0060(0.0578) Grad: 0.0819  
Epoch: [1][44480/51233] Data 0.306 (0.310) Elapsed 395m 42s (remain 60m 3s) Loss: 0.0067(0.0578) Grad: 0.0944  
Epoch: [1][44500/51233] Data 0.314 (0.310) Elapsed 395m 52s (remain 59m 53s) Loss: 0.0069(0.0578) Grad: 0.1305  
Epoch: [1][44520/51233] Data 0.318 (0.310) Elapsed 396m 3s (remain 59m 42s) Loss: 0.0176(0.0578) Grad: 0.3210  
Epoch: [1][44540/51233] Data 0.305 (0.310) Elapsed 396m 14s (remain 59m 31s) Loss: 0.0028(0.0578) Grad: 0.0434  
Epoch: [1][44560/51233] Data 0.317 (0.310) Elapsed 396m 24s (remain 59m 21s) Loss: 0.0115(0.0578) Grad: 0.1666  
Epoch: [1][44580/51233] Data 0.305 (0.310) Elapsed 396m 35s (remain 59m 10s) Loss: 0.0281(0.0578) Grad: 0.3991  
Epoch: [1][44600/51233] Data 0.296 (0.310) Elapsed 396m 46s (remain 58m 59s) Loss: 0.0020(0.0578) Grad: 0.0357  
Epoch: [1][44620/51233] Data 0.318 (0.310) Elapsed 396m 57s (remain 58m 49s) Loss: 0.0010(0.0577) Grad: 0.0159  
Epoch: [1][44640/51233] Data 0.312 (0.310) Elapsed 397m 7s (remain 58m 38s) Loss: 0.0090(0.0577) Grad: 0.1776  
Epoch: [1][44660/51233] Data 0.316 (0.310) Elapsed 397m 18s (remain 58m 27s) Loss: 0.0011(0.0577) Grad: 0.0166  
Epoch: [1][44680/51233] Data 0.317 (0.310) Elapsed 397m 29s (remain 58m 17s) Loss: 0.0013(0.0577) Grad: 0.0297  
Epoch: [1][44700/51233] Data 0.318 (0.310) Elapsed 397m 39s (remain 58m 6s) Loss: 0.0069(0.0577) Grad: 0.1652  
Epoch: [1][44720/51233] Data 0.309 (0.310) Elapsed 397m 50s (remain 57m 55s) Loss: 0.1997(0.0577) Grad: 1.6470  
Epoch: [1][44740/51233] Data 0.316 (0.310) Elapsed 398m 1s (remain 57m 45s) Loss: 0.0063(0.0577) Grad: 0.1704  
Epoch: [1][44760/51233] Data 0.290 (0.310) Elapsed 398m 11s (remain 57m 34s) Loss: 0.0155(0.0577) Grad: 0.2877  
Epoch: [1][44780/51233] Data 0.308 (0.310) Elapsed 398m 22s (remain 57m 23s) Loss: 0.0315(0.0577) Grad: 0.4416  
Epoch: [1][44800/51233] Data 0.308 (0.310) Elapsed 398m 33s (remain 57m 13s) Loss: 0.0054(0.0577) Grad: 0.1187  
Epoch: [1][44820/51233] Data 0.309 (0.310) Elapsed 398m 43s (remain 57m 2s) Loss: 0.0386(0.0577) Grad: 0.5796  
Epoch: [1][44840/51233] Data 0.309 (0.310) Elapsed 398m 54s (remain 56m 51s) Loss: 0.0087(0.0577) Grad: 0.1680  
Epoch: [1][44860/51233] Data 0.317 (0.310) Elapsed 399m 5s (remain 56m 41s) Loss: 0.0065(0.0577) Grad: 0.1460  
Epoch: [1][44880/51233] Data 0.309 (0.310) Elapsed 399m 15s (remain 56m 30s) Loss: 0.0507(0.0577) Grad: 0.7912  
Epoch: [1][44900/51233] Data 0.307 (0.310) Elapsed 399m 26s (remain 56m 19s) Loss: 0.0055(0.0577) Grad: 0.0731  
Epoch: [1][44920/51233] Data 0.317 (0.310) Elapsed 399m 37s (remain 56m 9s) Loss: 0.0192(0.0577) Grad: 0.6145  
Epoch: [1][44940/51233] Data 0.309 (0.310) Elapsed 399m 48s (remain 55m 58s) Loss: 0.0342(0.0577) Grad: 0.8417  
Epoch: [1][44960/51233] Data 0.305 (0.310) Elapsed 399m 58s (remain 55m 47s) Loss: 0.0136(0.0577) Grad: 0.2478  
Epoch: [1][44980/51233] Data 0.308 (0.310) Elapsed 400m 9s (remain 55m 37s) Loss: 0.0886(0.0577) Grad: 0.8671  
Epoch: [1][45000/51233] Data 0.318 (0.310) Elapsed 400m 20s (remain 55m 26s) Loss: 0.0138(0.0577) Grad: 0.3319  
Epoch: [1][45020/51233] Data 0.310 (0.310) Elapsed 400m 30s (remain 55m 15s) Loss: 0.0164(0.0577) Grad: 0.2751  
Epoch: [1][45040/51233] Data 0.318 (0.310) Elapsed 400m 41s (remain 55m 5s) Loss: 0.0017(0.0577) Grad: 0.0179  
Epoch: [1][45060/51233] Data 0.302 (0.310) Elapsed 400m 52s (remain 54m 54s) Loss: 0.0030(0.0577) Grad: 0.0434  
Epoch: [1][45080/51233] Data 0.295 (0.310) Elapsed 401m 2s (remain 54m 43s) Loss: 0.1777(0.0577) Grad: 1.2557  
Epoch: [1][45100/51233] Data 0.317 (0.310) Elapsed 401m 13s (remain 54m 33s) Loss: 0.0329(0.0577) Grad: 0.4908  
Epoch: [1][45120/51233] Data 0.317 (0.310) Elapsed 401m 24s (remain 54m 22s) Loss: 0.0098(0.0577) Grad: 0.1968  
Epoch: [1][45140/51233] Data 0.313 (0.310) Elapsed 401m 34s (remain 54m 11s) Loss: 0.0261(0.0577) Grad: 0.4971  
Epoch: [1][45160/51233] Data 0.315 (0.310) Elapsed 401m 45s (remain 54m 1s) Loss: 0.2661(0.0577) Grad: 1.1317  
Epoch: [1][45180/51233] Data 0.318 (0.310) Elapsed 401m 56s (remain 53m 50s) Loss: 0.0117(0.0576) Grad: 0.2584  
Epoch: [1][45200/51233] Data 0.309 (0.310) Elapsed 402m 6s (remain 53m 39s) Loss: 0.0077(0.0576) Grad: 0.1014  
Epoch: [1][45220/51233] Data 0.317 (0.310) Elapsed 402m 17s (remain 53m 29s) Loss: 0.0440(0.0576) Grad: 0.6348  
Epoch: [1][45240/51233] Data 0.317 (0.310) Elapsed 402m 28s (remain 53m 18s) Loss: 0.0188(0.0576) Grad: 0.3456  
Epoch: [1][45260/51233] Data 0.306 (0.310) Elapsed 402m 39s (remain 53m 7s) Loss: 0.0045(0.0576) Grad: 0.0611  
Epoch: [1][45280/51233] Data 0.304 (0.310) Elapsed 402m 49s (remain 52m 57s) Loss: 0.0027(0.0576) Grad: 0.0276  
Epoch: [1][45300/51233] Data 0.309 (0.310) Elapsed 403m 0s (remain 52m 46s) Loss: 0.4284(0.0576) Grad: 2.5467  
Epoch: [1][45320/51233] Data 0.294 (0.310) Elapsed 403m 11s (remain 52m 35s) Loss: 0.0457(0.0576) Grad: 0.5912  
Epoch: [1][45340/51233] Data 0.305 (0.310) Elapsed 403m 21s (remain 52m 24s) Loss: 0.0009(0.0576) Grad: 0.0089  
Epoch: [1][45360/51233] Data 0.317 (0.310) Elapsed 403m 32s (remain 52m 14s) Loss: 0.0066(0.0576) Grad: 0.0676  
Epoch: [1][45380/51233] Data 0.296 (0.310) Elapsed 403m 43s (remain 52m 3s) Loss: 0.0190(0.0576) Grad: 0.2726  
Epoch: [1][45400/51233] Data 0.301 (0.310) Elapsed 403m 53s (remain 51m 52s) Loss: 0.0019(0.0576) Grad: 0.0292  
Epoch: [1][45420/51233] Data 0.309 (0.310) Elapsed 404m 4s (remain 51m 42s) Loss: 0.0152(0.0576) Grad: 0.1761  
Epoch: [1][45440/51233] Data 0.312 (0.310) Elapsed 404m 15s (remain 51m 31s) Loss: 0.0026(0.0576) Grad: 0.0276  
Epoch: [1][45460/51233] Data 0.318 (0.310) Elapsed 404m 25s (remain 51m 20s) Loss: 0.0078(0.0576) Grad: 0.1007  
Epoch: [1][45480/51233] Data 0.318 (0.310) Elapsed 404m 36s (remain 51m 10s) Loss: 0.0035(0.0576) Grad: 0.0796  
Epoch: [1][45500/51233] Data 0.314 (0.310) Elapsed 404m 47s (remain 50m 59s) Loss: 0.0074(0.0576) Grad: 0.1704  
Epoch: [1][45520/51233] Data 0.309 (0.310) Elapsed 404m 57s (remain 50m 48s) Loss: 0.0365(0.0576) Grad: 1.1882  
Epoch: [1][45540/51233] Data 0.317 (0.310) Elapsed 405m 8s (remain 50m 38s) Loss: 0.0111(0.0576) Grad: 0.1882  
Epoch: [1][45560/51233] Data 0.316 (0.310) Elapsed 405m 19s (remain 50m 27s) Loss: 0.0054(0.0576) Grad: 0.1238  
Epoch: [1][45580/51233] Data 0.313 (0.310) Elapsed 405m 29s (remain 50m 16s) Loss: 0.0054(0.0576) Grad: 0.0850  
Epoch: [1][45600/51233] Data 0.316 (0.310) Elapsed 405m 40s (remain 50m 6s) Loss: 0.0015(0.0576) Grad: 0.0203  
Epoch: [1][45620/51233] Data 0.310 (0.310) Elapsed 405m 51s (remain 49m 55s) Loss: 0.0007(0.0576) Grad: 0.0093  
Epoch: [1][45640/51233] Data 0.315 (0.310) Elapsed 406m 2s (remain 49m 44s) Loss: 0.0015(0.0576) Grad: 0.0229  
Epoch: [1][45660/51233] Data 0.318 (0.310) Elapsed 406m 12s (remain 49m 34s) Loss: 0.0303(0.0576) Grad: 0.3992  
Epoch: [1][45680/51233] Data 0.317 (0.310) Elapsed 406m 23s (remain 49m 23s) Loss: 0.0588(0.0576) Grad: 0.7978  
Epoch: [1][45700/51233] Data 0.317 (0.310) Elapsed 406m 34s (remain 49m 12s) Loss: 0.0025(0.0576) Grad: 0.0403  
Epoch: [1][45720/51233] Data 0.314 (0.310) Elapsed 406m 44s (remain 49m 2s) Loss: 0.0399(0.0575) Grad: 0.8083  
Epoch: [1][45740/51233] Data 0.317 (0.310) Elapsed 406m 55s (remain 48m 51s) Loss: 0.0016(0.0576) Grad: 0.0243  
Epoch: [1][45760/51233] Data 0.307 (0.310) Elapsed 407m 6s (remain 48m 40s) Loss: 0.0647(0.0575) Grad: 0.9403  
Epoch: [1][45780/51233] Data 0.318 (0.310) Elapsed 407m 16s (remain 48m 30s) Loss: 0.0182(0.0575) Grad: 0.2838  
Epoch: [1][45800/51233] Data 0.318 (0.310) Elapsed 407m 27s (remain 48m 19s) Loss: 0.0013(0.0575) Grad: 0.0226  
Epoch: [1][45820/51233] Data 0.313 (0.310) Elapsed 407m 38s (remain 48m 8s) Loss: 0.1020(0.0575) Grad: 1.1457  
Epoch: [1][45840/51233] Data 0.314 (0.310) Elapsed 407m 48s (remain 47m 58s) Loss: 0.1901(0.0575) Grad: 2.7210  
Epoch: [1][45860/51233] Data 0.307 (0.310) Elapsed 407m 59s (remain 47m 47s) Loss: 0.0116(0.0575) Grad: 0.2298  
Epoch: [1][45880/51233] Data 0.306 (0.310) Elapsed 408m 10s (remain 47m 36s) Loss: 0.0107(0.0575) Grad: 0.3799  
Epoch: [1][45900/51233] Data 0.317 (0.310) Elapsed 408m 20s (remain 47m 26s) Loss: 0.0048(0.0575) Grad: 0.0844  
Epoch: [1][45920/51233] Data 0.316 (0.310) Elapsed 408m 31s (remain 47m 15s) Loss: 0.0018(0.0575) Grad: 0.0208  
Epoch: [1][45940/51233] Data 0.311 (0.310) Elapsed 408m 42s (remain 47m 4s) Loss: 0.1759(0.0575) Grad: 1.5989  
Epoch: [1][45960/51233] Data 0.308 (0.310) Elapsed 408m 53s (remain 46m 54s) Loss: 0.0072(0.0575) Grad: 0.2348  
Epoch: [1][45980/51233] Data 0.314 (0.310) Elapsed 409m 3s (remain 46m 43s) Loss: 0.0118(0.0575) Grad: 0.2391  
Epoch: [1][46000/51233] Data 0.296 (0.310) Elapsed 409m 14s (remain 46m 32s) Loss: 0.1910(0.0575) Grad: 0.8727  
Epoch: [1][46020/51233] Data 0.307 (0.310) Elapsed 409m 25s (remain 46m 22s) Loss: 0.0050(0.0575) Grad: 0.0802  
Epoch: [1][46040/51233] Data 0.317 (0.310) Elapsed 409m 35s (remain 46m 11s) Loss: 0.0082(0.0575) Grad: 0.2013  
Epoch: [1][46060/51233] Data 0.317 (0.310) Elapsed 409m 46s (remain 46m 0s) Loss: 0.0077(0.0575) Grad: 0.1743  
Epoch: [1][46080/51233] Data 0.307 (0.310) Elapsed 409m 57s (remain 45m 50s) Loss: 0.0260(0.0575) Grad: 0.3992  
Epoch: [1][46100/51233] Data 0.296 (0.310) Elapsed 410m 7s (remain 45m 39s) Loss: 0.0369(0.0575) Grad: 0.3845  
Epoch: [1][46120/51233] Data 0.317 (0.310) Elapsed 410m 18s (remain 45m 28s) Loss: 0.0095(0.0575) Grad: 0.1198  
Epoch: [1][46140/51233] Data 0.317 (0.310) Elapsed 410m 29s (remain 45m 18s) Loss: 0.0036(0.0575) Grad: 0.0652  
Epoch: [1][46160/51233] Data 0.309 (0.310) Elapsed 410m 39s (remain 45m 7s) Loss: 0.0749(0.0575) Grad: 0.8586  
Epoch: [1][46180/51233] Data 0.318 (0.310) Elapsed 410m 50s (remain 44m 56s) Loss: 0.0246(0.0575) Grad: 0.3843  
Epoch: [1][46200/51233] Data 0.308 (0.310) Elapsed 411m 1s (remain 44m 45s) Loss: 0.0027(0.0575) Grad: 0.0380  
Epoch: [1][46220/51233] Data 0.317 (0.310) Elapsed 411m 11s (remain 44m 35s) Loss: 0.0393(0.0574) Grad: 0.4296  
Epoch: [1][46240/51233] Data 0.308 (0.310) Elapsed 411m 22s (remain 44m 24s) Loss: 0.0091(0.0574) Grad: 0.1780  
Epoch: [1][46260/51233] Data 0.304 (0.310) Elapsed 411m 33s (remain 44m 13s) Loss: 0.0018(0.0574) Grad: 0.0200  
Epoch: [1][46280/51233] Data 0.293 (0.310) Elapsed 411m 43s (remain 44m 3s) Loss: 0.0045(0.0574) Grad: 0.0572  
Epoch: [1][46300/51233] Data 0.293 (0.310) Elapsed 411m 54s (remain 43m 52s) Loss: 0.0075(0.0574) Grad: 0.0728  
Epoch: [1][46320/51233] Data 0.313 (0.310) Elapsed 412m 5s (remain 43m 41s) Loss: 0.0325(0.0574) Grad: 0.9312  
Epoch: [1][46340/51233] Data 0.317 (0.310) Elapsed 412m 16s (remain 43m 31s) Loss: 0.0731(0.0574) Grad: 1.3281  
Epoch: [1][46360/51233] Data 0.314 (0.310) Elapsed 412m 26s (remain 43m 20s) Loss: 0.0059(0.0574) Grad: 0.1374  
Epoch: [1][46380/51233] Data 0.305 (0.310) Elapsed 412m 37s (remain 43m 9s) Loss: 0.0031(0.0574) Grad: 0.0428  
Epoch: [1][46400/51233] Data 0.297 (0.310) Elapsed 412m 48s (remain 42m 59s) Loss: 0.0107(0.0574) Grad: 0.1946  
Epoch: [1][46420/51233] Data 0.308 (0.310) Elapsed 412m 58s (remain 42m 48s) Loss: 0.0168(0.0574) Grad: 0.1955  
Epoch: [1][46440/51233] Data 0.306 (0.310) Elapsed 413m 9s (remain 42m 37s) Loss: 0.0152(0.0574) Grad: 0.3390  
Epoch: [1][46460/51233] Data 0.317 (0.310) Elapsed 413m 20s (remain 42m 27s) Loss: 0.0120(0.0574) Grad: 0.2381  
Epoch: [1][46480/51233] Data 0.318 (0.310) Elapsed 413m 30s (remain 42m 16s) Loss: 0.0324(0.0574) Grad: 0.3302  
Epoch: [1][46500/51233] Data 0.307 (0.310) Elapsed 413m 41s (remain 42m 5s) Loss: 0.1022(0.0574) Grad: 1.3134  
Epoch: [1][46520/51233] Data 0.310 (0.310) Elapsed 413m 52s (remain 41m 55s) Loss: 0.0067(0.0574) Grad: 0.1195  
Epoch: [1][46540/51233] Data 0.313 (0.310) Elapsed 414m 2s (remain 41m 44s) Loss: 0.1442(0.0574) Grad: 1.2847  
Epoch: [1][46560/51233] Data 0.308 (0.310) Elapsed 414m 13s (remain 41m 33s) Loss: 0.0081(0.0574) Grad: 0.2397  
Epoch: [1][46580/51233] Data 0.309 (0.310) Elapsed 414m 24s (remain 41m 23s) Loss: 0.3664(0.0574) Grad: 2.7911  
Epoch: [1][46600/51233] Data 0.296 (0.310) Elapsed 414m 34s (remain 41m 12s) Loss: 0.0048(0.0574) Grad: 0.0551  
Epoch: [1][46620/51233] Data 0.308 (0.310) Elapsed 414m 45s (remain 41m 1s) Loss: 0.1155(0.0574) Grad: 1.4211  
Epoch: [1][46640/51233] Data 0.318 (0.310) Elapsed 414m 56s (remain 40m 51s) Loss: 0.0014(0.0574) Grad: 0.0139  
Epoch: [1][46660/51233] Data 0.317 (0.310) Elapsed 415m 6s (remain 40m 40s) Loss: 0.0286(0.0574) Grad: 0.3674  
Epoch: [1][46680/51233] Data 0.305 (0.310) Elapsed 415m 17s (remain 40m 29s) Loss: 0.0155(0.0574) Grad: 0.2642  
Epoch: [1][46700/51233] Data 0.311 (0.310) Elapsed 415m 28s (remain 40m 19s) Loss: 0.0128(0.0574) Grad: 0.2321  
Epoch: [1][46720/51233] Data 0.317 (0.310) Elapsed 415m 38s (remain 40m 8s) Loss: 0.1250(0.0574) Grad: 1.4840  
Epoch: [1][46740/51233] Data 0.317 (0.310) Elapsed 415m 49s (remain 39m 57s) Loss: 0.0011(0.0574) Grad: 0.0123  
Epoch: [1][46760/51233] Data 0.318 (0.310) Elapsed 416m 0s (remain 39m 47s) Loss: 0.0237(0.0574) Grad: 0.5397  
Epoch: [1][46780/51233] Data 0.318 (0.310) Elapsed 416m 10s (remain 39m 36s) Loss: 0.0442(0.0574) Grad: 0.5716  
Epoch: [1][46800/51233] Data 0.311 (0.310) Elapsed 416m 21s (remain 39m 25s) Loss: 0.0595(0.0574) Grad: 1.3503  
Epoch: [1][46820/51233] Data 0.306 (0.310) Elapsed 416m 32s (remain 39m 15s) Loss: 0.0134(0.0574) Grad: 0.2416  
Epoch: [1][46840/51233] Data 0.317 (0.310) Elapsed 416m 42s (remain 39m 4s) Loss: 0.0154(0.0574) Grad: 0.1828  
Epoch: [1][46860/51233] Data 0.304 (0.310) Elapsed 416m 53s (remain 38m 53s) Loss: 0.0092(0.0574) Grad: 0.1737  
Epoch: [1][46880/51233] Data 0.316 (0.310) Elapsed 417m 4s (remain 38m 43s) Loss: 0.0160(0.0574) Grad: 0.3351  
Epoch: [1][46900/51233] Data 0.309 (0.310) Elapsed 417m 14s (remain 38m 32s) Loss: 0.0179(0.0574) Grad: 0.2427  
Epoch: [1][46920/51233] Data 0.306 (0.310) Elapsed 417m 25s (remain 38m 21s) Loss: 0.0046(0.0574) Grad: 0.0667  
Epoch: [1][46940/51233] Data 0.317 (0.310) Elapsed 417m 36s (remain 38m 10s) Loss: 0.2159(0.0574) Grad: 1.1853  
Epoch: [1][46960/51233] Data 0.313 (0.310) Elapsed 417m 46s (remain 38m 0s) Loss: 0.0123(0.0574) Grad: 0.1366  
Epoch: [1][46980/51233] Data 0.303 (0.310) Elapsed 417m 57s (remain 37m 49s) Loss: 0.0190(0.0574) Grad: 0.5270  
Epoch: [1][47000/51233] Data 0.313 (0.310) Elapsed 418m 8s (remain 37m 38s) Loss: 0.0024(0.0574) Grad: 0.0385  
Epoch: [1][47020/51233] Data 0.313 (0.310) Elapsed 418m 18s (remain 37m 28s) Loss: 0.0055(0.0574) Grad: 0.0940  
Epoch: [1][47040/51233] Data 0.306 (0.310) Elapsed 418m 29s (remain 37m 17s) Loss: 0.0547(0.0574) Grad: 0.7091  
Epoch: [1][47060/51233] Data 0.309 (0.310) Elapsed 418m 40s (remain 37m 6s) Loss: 0.0388(0.0573) Grad: 0.9350  
Epoch: [1][47080/51233] Data 0.307 (0.310) Elapsed 418m 50s (remain 36m 56s) Loss: 0.0010(0.0573) Grad: 0.0137  
Epoch: [1][47100/51233] Data 0.307 (0.310) Elapsed 419m 1s (remain 36m 45s) Loss: 0.0016(0.0573) Grad: 0.0199  
Epoch: [1][47120/51233] Data 0.318 (0.310) Elapsed 419m 12s (remain 36m 34s) Loss: 0.2815(0.0573) Grad: 1.7627  
Epoch: [1][47140/51233] Data 0.313 (0.310) Elapsed 419m 23s (remain 36m 24s) Loss: 0.0089(0.0573) Grad: 0.2391  
Epoch: [1][47160/51233] Data 0.317 (0.310) Elapsed 419m 33s (remain 36m 13s) Loss: 0.0091(0.0573) Grad: 0.1234  
Epoch: [1][47180/51233] Data 0.295 (0.310) Elapsed 419m 44s (remain 36m 2s) Loss: 0.0049(0.0573) Grad: 0.1031  
Epoch: [1][47200/51233] Data 0.311 (0.310) Elapsed 419m 55s (remain 35m 52s) Loss: 0.0033(0.0573) Grad: 0.0851  
Epoch: [1][47220/51233] Data 0.317 (0.310) Elapsed 420m 5s (remain 35m 41s) Loss: 0.0011(0.0573) Grad: 0.0489  
Epoch: [1][47240/51233] Data 0.306 (0.310) Elapsed 420m 16s (remain 35m 30s) Loss: 0.0669(0.0573) Grad: 1.0532  
Epoch: [1][47260/51233] Data 0.307 (0.310) Elapsed 420m 27s (remain 35m 20s) Loss: 0.0173(0.0573) Grad: 0.2656  
Epoch: [1][47280/51233] Data 0.302 (0.310) Elapsed 420m 37s (remain 35m 9s) Loss: 0.0480(0.0573) Grad: 0.7698  
Epoch: [1][47300/51233] Data 0.318 (0.310) Elapsed 420m 48s (remain 34m 58s) Loss: 0.0012(0.0573) Grad: 0.0141  
Epoch: [1][47320/51233] Data 0.303 (0.310) Elapsed 420m 59s (remain 34m 48s) Loss: 0.0199(0.0573) Grad: 0.3397  
Epoch: [1][47340/51233] Data 0.298 (0.310) Elapsed 421m 9s (remain 34m 37s) Loss: 0.0153(0.0573) Grad: 0.3474  
Epoch: [1][47360/51233] Data 0.304 (0.310) Elapsed 421m 20s (remain 34m 26s) Loss: 0.0043(0.0573) Grad: 0.1054  
Epoch: [1][47380/51233] Data 0.308 (0.310) Elapsed 421m 31s (remain 34m 16s) Loss: 0.0016(0.0573) Grad: 0.0139  
Epoch: [1][47400/51233] Data 0.314 (0.310) Elapsed 421m 41s (remain 34m 5s) Loss: 0.0015(0.0573) Grad: 0.0325  
Epoch: [1][47420/51233] Data 0.307 (0.310) Elapsed 421m 52s (remain 33m 54s) Loss: 0.0069(0.0573) Grad: 0.1024  
Epoch: [1][47440/51233] Data 0.293 (0.310) Elapsed 422m 3s (remain 33m 44s) Loss: 0.0171(0.0573) Grad: 0.2357  
Epoch: [1][47460/51233] Data 0.315 (0.310) Elapsed 422m 14s (remain 33m 33s) Loss: 0.0050(0.0573) Grad: 0.0754  
Epoch: [1][47480/51233] Data 0.318 (0.310) Elapsed 422m 24s (remain 33m 22s) Loss: 0.0081(0.0573) Grad: 0.1526  
Epoch: [1][47500/51233] Data 0.318 (0.310) Elapsed 422m 35s (remain 33m 12s) Loss: 0.0139(0.0572) Grad: 0.3089  
Epoch: [1][47520/51233] Data 0.317 (0.310) Elapsed 422m 46s (remain 33m 1s) Loss: 0.1430(0.0573) Grad: 1.7391  
Epoch: [1][47540/51233] Data 0.310 (0.310) Elapsed 422m 56s (remain 32m 50s) Loss: 0.0102(0.0572) Grad: 0.2195  
Epoch: [1][47560/51233] Data 0.308 (0.310) Elapsed 423m 7s (remain 32m 40s) Loss: 0.0084(0.0572) Grad: 0.1146  
Epoch: [1][47580/51233] Data 0.318 (0.310) Elapsed 423m 18s (remain 32m 29s) Loss: 0.0372(0.0572) Grad: 0.3881  
Epoch: [1][47600/51233] Data 0.318 (0.310) Elapsed 423m 28s (remain 32m 18s) Loss: 0.0944(0.0572) Grad: 1.5792  
Epoch: [1][47620/51233] Data 0.295 (0.310) Elapsed 423m 39s (remain 32m 8s) Loss: 0.1334(0.0572) Grad: 1.3145  
Epoch: [1][47640/51233] Data 0.317 (0.310) Elapsed 423m 50s (remain 31m 57s) Loss: 0.0131(0.0572) Grad: 0.3405  
Epoch: [1][47660/51233] Data 0.317 (0.310) Elapsed 424m 0s (remain 31m 46s) Loss: 0.2876(0.0572) Grad: 2.8932  
Epoch: [1][47680/51233] Data 0.317 (0.310) Elapsed 424m 11s (remain 31m 36s) Loss: 0.0005(0.0572) Grad: 0.0054  
Epoch: [1][47700/51233] Data 0.318 (0.310) Elapsed 424m 22s (remain 31m 25s) Loss: 0.0023(0.0572) Grad: 0.0372  
Epoch: [1][47720/51233] Data 0.309 (0.310) Elapsed 424m 32s (remain 31m 14s) Loss: 0.0038(0.0572) Grad: 0.0462  
Epoch: [1][47740/51233] Data 0.308 (0.310) Elapsed 424m 43s (remain 31m 3s) Loss: 0.2193(0.0572) Grad: 1.5258  
Epoch: [1][47760/51233] Data 0.318 (0.310) Elapsed 424m 54s (remain 30m 53s) Loss: 0.0118(0.0572) Grad: 0.2898  
Epoch: [1][47780/51233] Data 0.309 (0.310) Elapsed 425m 5s (remain 30m 42s) Loss: 0.0038(0.0572) Grad: 0.0491  
Epoch: [1][47800/51233] Data 0.296 (0.310) Elapsed 425m 15s (remain 30m 31s) Loss: 0.0288(0.0572) Grad: 0.6169  
Epoch: [1][47820/51233] Data 0.309 (0.310) Elapsed 425m 26s (remain 30m 21s) Loss: 0.0754(0.0572) Grad: 1.3559  
Epoch: [1][47840/51233] Data 0.308 (0.310) Elapsed 425m 37s (remain 30m 10s) Loss: 0.0357(0.0572) Grad: 0.6363  
Epoch: [1][47860/51233] Data 0.308 (0.310) Elapsed 425m 47s (remain 29m 59s) Loss: 0.0016(0.0572) Grad: 0.0190  
Epoch: [1][47880/51233] Data 0.317 (0.310) Elapsed 425m 58s (remain 29m 49s) Loss: 0.3218(0.0572) Grad: 1.0120  
Epoch: [1][47900/51233] Data 0.313 (0.310) Elapsed 426m 9s (remain 29m 38s) Loss: 0.0813(0.0572) Grad: 1.1323  
Epoch: [1][47920/51233] Data 0.316 (0.310) Elapsed 426m 19s (remain 29m 27s) Loss: 0.0594(0.0572) Grad: 0.9722  
Epoch: [1][47940/51233] Data 0.313 (0.310) Elapsed 426m 30s (remain 29m 17s) Loss: 0.0657(0.0572) Grad: 0.5736  
Epoch: [1][47960/51233] Data 0.311 (0.310) Elapsed 426m 41s (remain 29m 6s) Loss: 0.0204(0.0572) Grad: 0.3573  
Epoch: [1][47980/51233] Data 0.308 (0.310) Elapsed 426m 51s (remain 28m 55s) Loss: 0.0243(0.0572) Grad: 0.5282  
Epoch: [1][48000/51233] Data 0.311 (0.310) Elapsed 427m 2s (remain 28m 45s) Loss: 0.1082(0.0571) Grad: 1.5452  
Epoch: [1][48020/51233] Data 0.305 (0.310) Elapsed 427m 13s (remain 28m 34s) Loss: 0.0056(0.0571) Grad: 0.0571  
Epoch: [1][48040/51233] Data 0.317 (0.310) Elapsed 427m 23s (remain 28m 23s) Loss: 0.0127(0.0571) Grad: 0.2729  
Epoch: [1][48060/51233] Data 0.316 (0.310) Elapsed 427m 34s (remain 28m 13s) Loss: 0.0045(0.0571) Grad: 0.0711  
Epoch: [1][48080/51233] Data 0.308 (0.310) Elapsed 427m 45s (remain 28m 2s) Loss: 0.0052(0.0571) Grad: 0.1218  
Epoch: [1][48100/51233] Data 0.310 (0.310) Elapsed 427m 56s (remain 27m 51s) Loss: 0.0105(0.0571) Grad: 0.2224  
Epoch: [1][48120/51233] Data 0.305 (0.310) Elapsed 428m 6s (remain 27m 41s) Loss: 0.0064(0.0571) Grad: 0.0895  
Epoch: [1][48140/51233] Data 0.296 (0.310) Elapsed 428m 17s (remain 27m 30s) Loss: 0.0139(0.0571) Grad: 0.1999  
Epoch: [1][48160/51233] Data 0.308 (0.310) Elapsed 428m 28s (remain 27m 19s) Loss: 0.0042(0.0571) Grad: 0.0613  
Epoch: [1][48180/51233] Data 0.314 (0.310) Elapsed 428m 38s (remain 27m 9s) Loss: 0.0609(0.0571) Grad: 0.9768  
Epoch: [1][48200/51233] Data 0.313 (0.310) Elapsed 428m 49s (remain 26m 58s) Loss: 0.0666(0.0571) Grad: 0.8229  
Epoch: [1][48220/51233] Data 0.310 (0.310) Elapsed 429m 0s (remain 26m 47s) Loss: 0.0010(0.0571) Grad: 0.0218  
Epoch: [1][48240/51233] Data 0.317 (0.310) Elapsed 429m 10s (remain 26m 37s) Loss: 0.0483(0.0571) Grad: 0.7557  
Epoch: [1][48260/51233] Data 0.318 (0.310) Elapsed 429m 21s (remain 26m 26s) Loss: 0.0181(0.0571) Grad: 0.2280  
Epoch: [1][48280/51233] Data 0.309 (0.310) Elapsed 429m 32s (remain 26m 15s) Loss: 0.0004(0.0571) Grad: 0.0043  
Epoch: [1][48300/51233] Data 0.306 (0.310) Elapsed 429m 42s (remain 26m 5s) Loss: 0.0006(0.0571) Grad: 0.0090  
Epoch: [1][48320/51233] Data 0.318 (0.310) Elapsed 429m 53s (remain 25m 54s) Loss: 0.1311(0.0571) Grad: 1.7781  
Epoch: [1][48340/51233] Data 0.295 (0.310) Elapsed 430m 4s (remain 25m 43s) Loss: 0.0046(0.0570) Grad: 0.0502  
Epoch: [1][48360/51233] Data 0.313 (0.310) Elapsed 430m 14s (remain 25m 33s) Loss: 0.0012(0.0570) Grad: 0.0139  
Epoch: [1][48380/51233] Data 0.318 (0.310) Elapsed 430m 25s (remain 25m 22s) Loss: 0.0019(0.0570) Grad: 0.0203  
Epoch: [1][48400/51233] Data 0.313 (0.310) Elapsed 430m 36s (remain 25m 11s) Loss: 0.0403(0.0570) Grad: 0.7584  
Epoch: [1][48420/51233] Data 0.314 (0.310) Elapsed 430m 46s (remain 25m 1s) Loss: 0.0009(0.0570) Grad: 0.0107  
Epoch: [1][48440/51233] Data 0.309 (0.310) Elapsed 430m 57s (remain 24m 50s) Loss: 0.3094(0.0570) Grad: 2.0928  
Epoch: [1][48460/51233] Data 0.295 (0.310) Elapsed 431m 8s (remain 24m 39s) Loss: 0.0019(0.0570) Grad: 0.0305  
Epoch: [1][48480/51233] Data 0.313 (0.310) Elapsed 431m 19s (remain 24m 29s) Loss: 0.0282(0.0570) Grad: 0.6569  
Epoch: [1][48500/51233] Data 0.306 (0.310) Elapsed 431m 29s (remain 24m 18s) Loss: 0.0007(0.0570) Grad: 0.0099  
Epoch: [1][48520/51233] Data 0.309 (0.310) Elapsed 431m 40s (remain 24m 7s) Loss: 0.0058(0.0570) Grad: 0.0874  
Epoch: [1][48540/51233] Data 0.318 (0.310) Elapsed 431m 51s (remain 23m 56s) Loss: 0.0007(0.0570) Grad: 0.0087  
Epoch: [1][48560/51233] Data 0.317 (0.310) Elapsed 432m 1s (remain 23m 46s) Loss: 0.0023(0.0570) Grad: 0.0354  
Epoch: [1][48580/51233] Data 0.317 (0.310) Elapsed 432m 12s (remain 23m 35s) Loss: 0.0022(0.0570) Grad: 0.0273  
Epoch: [1][48600/51233] Data 0.317 (0.310) Elapsed 432m 23s (remain 23m 24s) Loss: 0.0351(0.0570) Grad: 0.5287  
Epoch: [1][48620/51233] Data 0.304 (0.310) Elapsed 432m 33s (remain 23m 14s) Loss: 0.0104(0.0570) Grad: 0.0929  
Epoch: [1][48640/51233] Data 0.317 (0.310) Elapsed 432m 44s (remain 23m 3s) Loss: 0.0069(0.0570) Grad: 0.1351  
Epoch: [1][48660/51233] Data 0.319 (0.310) Elapsed 432m 55s (remain 22m 52s) Loss: 0.1879(0.0570) Grad: 1.0675  
Epoch: [1][48680/51233] Data 0.316 (0.310) Elapsed 433m 5s (remain 22m 42s) Loss: 0.0105(0.0570) Grad: 0.2060  
Epoch: [1][48700/51233] Data 0.305 (0.310) Elapsed 433m 16s (remain 22m 31s) Loss: 0.0438(0.0570) Grad: 0.9407  
Epoch: [1][48720/51233] Data 0.308 (0.310) Elapsed 433m 27s (remain 22m 20s) Loss: 0.0016(0.0570) Grad: 0.0150  
Epoch: [1][48740/51233] Data 0.310 (0.310) Elapsed 433m 37s (remain 22m 10s) Loss: 0.0026(0.0569) Grad: 0.0507  
Epoch: [1][48760/51233] Data 0.295 (0.310) Elapsed 433m 48s (remain 21m 59s) Loss: 0.0020(0.0569) Grad: 0.0259  
Epoch: [1][48780/51233] Data 0.317 (0.310) Elapsed 433m 59s (remain 21m 48s) Loss: 0.0135(0.0570) Grad: 0.1622  
Epoch: [1][48800/51233] Data 0.319 (0.310) Elapsed 434m 9s (remain 21m 38s) Loss: 0.0041(0.0570) Grad: 0.0704  
Epoch: [1][48820/51233] Data 0.311 (0.310) Elapsed 434m 20s (remain 21m 27s) Loss: 0.0372(0.0570) Grad: 0.8608  
Epoch: [1][48840/51233] Data 0.318 (0.310) Elapsed 434m 31s (remain 21m 16s) Loss: 0.0091(0.0570) Grad: 0.1590  
Epoch: [1][48860/51233] Data 0.309 (0.310) Elapsed 434m 42s (remain 21m 6s) Loss: 0.0158(0.0570) Grad: 0.2020  
Epoch: [1][48880/51233] Data 0.307 (0.310) Elapsed 434m 52s (remain 20m 55s) Loss: 0.0620(0.0570) Grad: 1.0200  
Epoch: [1][48900/51233] Data 0.317 (0.310) Elapsed 435m 3s (remain 20m 44s) Loss: 0.0919(0.0569) Grad: 1.3926  
Epoch: [1][48920/51233] Data 0.318 (0.310) Elapsed 435m 14s (remain 20m 34s) Loss: 0.0441(0.0569) Grad: 0.5343  
Epoch: [1][48940/51233] Data 0.309 (0.310) Elapsed 435m 24s (remain 20m 23s) Loss: 0.0033(0.0569) Grad: 0.0996  
Epoch: [1][48960/51233] Data 0.318 (0.310) Elapsed 435m 35s (remain 20m 12s) Loss: 0.0201(0.0569) Grad: 0.3166  
Epoch: [1][48980/51233] Data 0.306 (0.310) Elapsed 435m 46s (remain 20m 2s) Loss: 0.0102(0.0569) Grad: 0.1831  
Epoch: [1][49000/51233] Data 0.318 (0.310) Elapsed 435m 56s (remain 19m 51s) Loss: 0.0380(0.0569) Grad: 0.5971  
Epoch: [1][49020/51233] Data 0.318 (0.310) Elapsed 436m 7s (remain 19m 40s) Loss: 0.0572(0.0569) Grad: 0.7520  
Epoch: [1][49040/51233] Data 0.317 (0.310) Elapsed 436m 18s (remain 19m 30s) Loss: 0.0007(0.0569) Grad: 0.0123  
Epoch: [1][49060/51233] Data 0.300 (0.310) Elapsed 436m 28s (remain 19m 19s) Loss: 0.1490(0.0569) Grad: 1.6209  
Epoch: [1][49080/51233] Data 0.317 (0.310) Elapsed 436m 39s (remain 19m 8s) Loss: 0.0380(0.0569) Grad: 0.6122  
Epoch: [1][49100/51233] Data 0.319 (0.310) Elapsed 436m 50s (remain 18m 58s) Loss: 0.0008(0.0569) Grad: 0.0090  
Epoch: [1][49120/51233] Data 0.308 (0.310) Elapsed 437m 0s (remain 18m 47s) Loss: 0.0703(0.0569) Grad: 0.9587  
Epoch: [1][49140/51233] Data 0.304 (0.310) Elapsed 437m 11s (remain 18m 36s) Loss: 0.0017(0.0569) Grad: 0.0185  
Epoch: [1][49160/51233] Data 0.317 (0.310) Elapsed 437m 22s (remain 18m 26s) Loss: 0.0365(0.0568) Grad: 0.5545  
Epoch: [1][49180/51233] Data 0.317 (0.310) Elapsed 437m 32s (remain 18m 15s) Loss: 0.0010(0.0568) Grad: 0.0108  
Epoch: [1][49200/51233] Data 0.308 (0.310) Elapsed 437m 43s (remain 18m 4s) Loss: 0.0091(0.0568) Grad: 0.2100  
Epoch: [1][49220/51233] Data 0.295 (0.310) Elapsed 437m 54s (remain 17m 54s) Loss: 0.1618(0.0568) Grad: 2.1881  
Epoch: [1][49240/51233] Data 0.312 (0.310) Elapsed 438m 5s (remain 17m 43s) Loss: 0.0066(0.0568) Grad: 0.2298  
Epoch: [1][49260/51233] Data 0.303 (0.310) Elapsed 438m 15s (remain 17m 32s) Loss: 0.0583(0.0568) Grad: 0.6244  
Epoch: [1][49280/51233] Data 0.314 (0.310) Elapsed 438m 26s (remain 17m 21s) Loss: 0.0036(0.0568) Grad: 0.0677  
Epoch: [1][49300/51233] Data 0.307 (0.310) Elapsed 438m 37s (remain 17m 11s) Loss: 0.0037(0.0568) Grad: 0.1112  
Epoch: [1][49320/51233] Data 0.309 (0.310) Elapsed 438m 47s (remain 17m 0s) Loss: 0.0487(0.0568) Grad: 0.8349  
Epoch: [1][49340/51233] Data 0.308 (0.310) Elapsed 438m 58s (remain 16m 49s) Loss: 0.0081(0.0568) Grad: 0.1661  
Epoch: [1][49360/51233] Data 0.317 (0.310) Elapsed 439m 9s (remain 16m 39s) Loss: 0.0238(0.0568) Grad: 0.3043  
Epoch: [1][49380/51233] Data 0.318 (0.310) Elapsed 439m 19s (remain 16m 28s) Loss: 0.0037(0.0568) Grad: 0.0652  
Epoch: [1][49400/51233] Data 0.318 (0.310) Elapsed 439m 30s (remain 16m 17s) Loss: 0.0027(0.0568) Grad: 0.0405  
Epoch: [1][49420/51233] Data 0.318 (0.310) Elapsed 439m 41s (remain 16m 7s) Loss: 0.0022(0.0568) Grad: 0.0210  
Epoch: [1][49440/51233] Data 0.300 (0.310) Elapsed 439m 51s (remain 15m 56s) Loss: 0.1024(0.0568) Grad: 1.6173  
Epoch: [1][49460/51233] Data 0.316 (0.310) Elapsed 440m 2s (remain 15m 45s) Loss: 0.1089(0.0568) Grad: 0.8866  
Epoch: [1][49480/51233] Data 0.308 (0.310) Elapsed 440m 13s (remain 15m 35s) Loss: 0.2732(0.0568) Grad: 1.7526  
Epoch: [1][49500/51233] Data 0.294 (0.310) Elapsed 440m 23s (remain 15m 24s) Loss: 0.0923(0.0568) Grad: 1.5389  
Epoch: [1][49520/51233] Data 0.306 (0.310) Elapsed 440m 34s (remain 15m 13s) Loss: 0.0015(0.0568) Grad: 0.0148  
Epoch: [1][49540/51233] Data 0.306 (0.310) Elapsed 440m 45s (remain 15m 3s) Loss: 0.0418(0.0568) Grad: 0.8529  
Epoch: [1][49560/51233] Data 0.304 (0.310) Elapsed 440m 56s (remain 14m 52s) Loss: 0.0971(0.0568) Grad: 1.4936  
Epoch: [1][49580/51233] Data 0.309 (0.310) Elapsed 441m 6s (remain 14m 41s) Loss: 0.0015(0.0568) Grad: 0.0173  
Epoch: [1][49600/51233] Data 0.317 (0.310) Elapsed 441m 17s (remain 14m 31s) Loss: 0.0078(0.0568) Grad: 0.1377  
Epoch: [1][49620/51233] Data 0.311 (0.310) Elapsed 441m 28s (remain 14m 20s) Loss: 0.0799(0.0568) Grad: 1.6956  
Epoch: [1][49640/51233] Data 0.315 (0.310) Elapsed 441m 38s (remain 14m 9s) Loss: 0.0022(0.0568) Grad: 0.0235  
Epoch: [1][49660/51233] Data 0.317 (0.310) Elapsed 441m 49s (remain 13m 59s) Loss: 0.1385(0.0568) Grad: 1.3812  
Epoch: [1][49680/51233] Data 0.311 (0.310) Elapsed 442m 0s (remain 13m 48s) Loss: 0.0094(0.0568) Grad: 0.1116  
Epoch: [1][49700/51233] Data 0.318 (0.310) Elapsed 442m 10s (remain 13m 37s) Loss: 0.0182(0.0568) Grad: 0.2198  
Epoch: [1][49720/51233] Data 0.308 (0.310) Elapsed 442m 21s (remain 13m 27s) Loss: 0.0027(0.0568) Grad: 0.0321  
Epoch: [1][49740/51233] Data 0.305 (0.310) Elapsed 442m 32s (remain 13m 16s) Loss: 0.1113(0.0568) Grad: 1.9329  
Epoch: [1][49760/51233] Data 0.309 (0.310) Elapsed 442m 42s (remain 13m 5s) Loss: 0.0197(0.0568) Grad: 0.4937  
Epoch: [1][49780/51233] Data 0.319 (0.310) Elapsed 442m 53s (remain 12m 55s) Loss: 0.0075(0.0568) Grad: 0.0698  
Epoch: [1][49800/51233] Data 0.302 (0.310) Elapsed 443m 4s (remain 12m 44s) Loss: 0.0044(0.0568) Grad: 0.0505  
Epoch: [1][49820/51233] Data 0.307 (0.310) Elapsed 443m 14s (remain 12m 33s) Loss: 0.1104(0.0568) Grad: 1.2670  
Epoch: [1][49840/51233] Data 0.318 (0.310) Elapsed 443m 25s (remain 12m 23s) Loss: 0.0143(0.0568) Grad: 0.1744  
Epoch: [1][49860/51233] Data 0.318 (0.310) Elapsed 443m 36s (remain 12m 12s) Loss: 0.0037(0.0568) Grad: 0.0292  
Epoch: [1][49880/51233] Data 0.300 (0.310) Elapsed 443m 46s (remain 12m 1s) Loss: 0.0122(0.0568) Grad: 0.1626  
Epoch: [1][49900/51233] Data 0.309 (0.310) Elapsed 443m 57s (remain 11m 51s) Loss: 0.0015(0.0568) Grad: 0.0127  
Epoch: [1][49920/51233] Data 0.313 (0.310) Elapsed 444m 8s (remain 11m 40s) Loss: 0.0130(0.0568) Grad: 0.1985  
Epoch: [1][49940/51233] Data 0.309 (0.310) Elapsed 444m 19s (remain 11m 29s) Loss: 0.0074(0.0568) Grad: 0.0816  
Epoch: [1][49960/51233] Data 0.317 (0.310) Elapsed 444m 29s (remain 11m 19s) Loss: 0.0167(0.0568) Grad: 0.2855  
Epoch: [1][49980/51233] Data 0.308 (0.310) Elapsed 444m 40s (remain 11m 8s) Loss: 0.0193(0.0568) Grad: 0.2904  
Epoch: [1][50000/51233] Data 0.308 (0.310) Elapsed 444m 51s (remain 10m 57s) Loss: 0.0087(0.0568) Grad: 0.1316  
Epoch: [1][50020/51233] Data 0.305 (0.310) Elapsed 445m 1s (remain 10m 46s) Loss: 0.0894(0.0568) Grad: 1.2343  
Epoch: [1][50040/51233] Data 0.312 (0.310) Elapsed 445m 12s (remain 10m 36s) Loss: 0.0102(0.0568) Grad: 0.2079  
Epoch: [1][50060/51233] Data 0.317 (0.310) Elapsed 445m 23s (remain 10m 25s) Loss: 0.0032(0.0568) Grad: 0.0564  
Epoch: [1][50080/51233] Data 0.308 (0.310) Elapsed 445m 33s (remain 10m 14s) Loss: 0.0515(0.0568) Grad: 0.9533  
Epoch: [1][50100/51233] Data 0.311 (0.310) Elapsed 445m 44s (remain 10m 4s) Loss: 0.0079(0.0568) Grad: 0.1258  
Epoch: [1][50120/51233] Data 0.317 (0.310) Elapsed 445m 55s (remain 9m 53s) Loss: 0.3652(0.0568) Grad: 2.2077  
Epoch: [1][50140/51233] Data 0.316 (0.310) Elapsed 446m 5s (remain 9m 42s) Loss: 0.0671(0.0568) Grad: 1.0306  
Epoch: [1][50160/51233] Data 0.298 (0.310) Elapsed 446m 16s (remain 9m 32s) Loss: 0.0060(0.0567) Grad: 0.0656  
Epoch: [1][50180/51233] Data 0.308 (0.310) Elapsed 446m 27s (remain 9m 21s) Loss: 0.0243(0.0568) Grad: 0.3016  
Epoch: [1][50200/51233] Data 0.307 (0.310) Elapsed 446m 37s (remain 9m 10s) Loss: 0.0218(0.0568) Grad: 0.2807  
Epoch: [1][50220/51233] Data 0.317 (0.310) Elapsed 446m 48s (remain 9m 0s) Loss: 0.0739(0.0568) Grad: 0.8046  
Epoch: [1][50240/51233] Data 0.317 (0.310) Elapsed 446m 59s (remain 8m 49s) Loss: 0.0096(0.0567) Grad: 0.1001  
Epoch: [1][50260/51233] Data 0.311 (0.310) Elapsed 447m 9s (remain 8m 38s) Loss: 0.0182(0.0567) Grad: 0.2324  
Epoch: [1][50280/51233] Data 0.310 (0.310) Elapsed 447m 20s (remain 8m 28s) Loss: 0.0034(0.0567) Grad: 0.0356  
Epoch: [1][50300/51233] Data 0.318 (0.310) Elapsed 447m 31s (remain 8m 17s) Loss: 0.0425(0.0567) Grad: 0.3546  
Epoch: [1][50320/51233] Data 0.310 (0.310) Elapsed 447m 42s (remain 8m 6s) Loss: 0.0373(0.0567) Grad: 0.4518  
Epoch: [1][50340/51233] Data 0.314 (0.310) Elapsed 447m 52s (remain 7m 56s) Loss: 0.0063(0.0567) Grad: 0.1113  
Epoch: [1][50360/51233] Data 0.303 (0.310) Elapsed 448m 3s (remain 7m 45s) Loss: 0.0238(0.0568) Grad: 0.5411  
Epoch: [1][50380/51233] Data 0.317 (0.310) Elapsed 448m 14s (remain 7m 34s) Loss: 0.0229(0.0567) Grad: 0.5935  
Epoch: [1][50400/51233] Data 0.308 (0.310) Elapsed 448m 24s (remain 7m 24s) Loss: 0.0056(0.0568) Grad: 0.0535  
Epoch: [1][50420/51233] Data 0.317 (0.310) Elapsed 448m 35s (remain 7m 13s) Loss: 0.0017(0.0567) Grad: 0.0168  
Epoch: [1][50440/51233] Data 0.309 (0.310) Elapsed 448m 46s (remain 7m 2s) Loss: 0.0474(0.0567) Grad: 0.8152  
Epoch: [1][50460/51233] Data 0.307 (0.310) Elapsed 448m 56s (remain 6m 52s) Loss: 0.0165(0.0567) Grad: 0.3318  
Epoch: [1][50480/51233] Data 0.309 (0.310) Elapsed 449m 7s (remain 6m 41s) Loss: 0.0021(0.0567) Grad: 0.0293  
Epoch: [1][50500/51233] Data 0.317 (0.310) Elapsed 449m 18s (remain 6m 30s) Loss: 0.0035(0.0567) Grad: 0.1373  
Epoch: [1][50520/51233] Data 0.308 (0.310) Elapsed 449m 28s (remain 6m 20s) Loss: 0.0006(0.0567) Grad: 0.0126  
Epoch: [1][50540/51233] Data 0.317 (0.310) Elapsed 449m 39s (remain 6m 9s) Loss: 0.0227(0.0567) Grad: 0.3749  
Epoch: [1][50560/51233] Data 0.317 (0.310) Elapsed 449m 50s (remain 5m 58s) Loss: 0.2974(0.0567) Grad: 1.9867  
Epoch: [1][50780/51233] Data 0.305 (0.310) Elapsed 451m 47s (remain 4m 1s) Loss: 0.0610(0.0566) Grad: 0.8677  
Epoch: [1][50800/51233] Data 0.315 (0.310) Elapsed 451m 58s (remain 3m 50s) Loss: 0.0199(0.0566) Grad: 0.3041  
Epoch: [1][50820/51233] Data 0.308 (0.310) Elapsed 452m 9s (remain 3m 39s) Loss: 0.0229(0.0566) Grad: 0.6531  
Epoch: [1][50840/51233] Data 0.309 (0.310) Elapsed 452m 19s (remain 3m 29s) Loss: 0.0591(0.0566) Grad: 0.9217  
Epoch: [1][50860/51233] Data 0.301 (0.310) Elapsed 452m 30s (remain 3m 18s) Loss: 0.0027(0.0566) Grad: 0.0465  
Epoch: [1][50880/51233] Data 0.301 (0.310) Elapsed 452m 41s (remain 3m 7s) Loss: 0.0192(0.0566) Grad: 0.3051  
Epoch: [1][50900/51233] Data 0.308 (0.310) Elapsed 452m 51s (remain 2m 57s) Loss: 0.0238(0.0566) Grad: 0.6553  
Epoch: [1][50920/51233] Data 0.310 (0.310) Elapsed 453m 2s (remain 2m 46s) Loss: 0.2512(0.0566) Grad: 2.5469  
Epoch: [1][50940/51233] Data 0.318 (0.310) Elapsed 453m 13s (remain 2m 35s) Loss: 0.0172(0.0566) Grad: 0.2799  
Epoch: [1][50960/51233] Data 0.317 (0.310) Elapsed 453m 24s (remain 2m 25s) Loss: 0.0027(0.0565) Grad: 0.0852  
Epoch: [1][50980/51233] Data 0.318 (0.310) Elapsed 453m 34s (remain 2m 14s) Loss: 0.0313(0.0565) Grad: 0.7539  
Epoch: [1][51000/51233] Data 0.318 (0.310) Elapsed 453m 45s (remain 2m 3s) Loss: 0.3127(0.0565) Grad: 1.9010  
Epoch: [1][51080/51233] Data 0.308 (0.310) Elapsed 454m 28s (remain 1m 21s) Loss: 0.0160(0.0565) Grad: 0.1568  
Epoch: [1][51100/51233] Data 0.295 (0.310) Elapsed 454m 38s (remain 1m 10s) Loss: 0.0333(0.0565) Grad: 0.7594  
Epoch: [1][51120/51233] Data 0.310 (0.310) Elapsed 454m 49s (remain 0m 59s) Loss: 0.0013(0.0565) Grad: 0.0160  
Epoch: [1][51140/51233] Data 0.308 (0.310) Elapsed 455m 0s (remain 0m 49s) Loss: 0.0360(0.0565) Grad: 0.9016  
Epoch: [1][51160/51233] Data 0.307 (0.310) Elapsed 455m 10s (remain 0m 38s) Loss: 0.0083(0.0565) Grad: 0.2403  
Epoch: [1][51180/51233] Data 0.309 (0.310) Elapsed 455m 21s (remain 0m 27s) Loss: 0.0309(0.0565) Grad: 0.6476  
Epoch: [1][51200/51233] Data 0.305 (0.310) Elapsed 455m 32s (remain 0m 17s) Loss: 0.0192(0.0565) Grad: 0.5389  
Epoch: [1][51220/51233] Data 0.299 (0.310) Elapsed 455m 42s (remain 0m 6s) Loss: 0.0175(0.0565) Grad: 0.3881  
Epoch: [1][51232/51233] Data 0.316 (0.310) Elapsed 455m 49s (remain 0m 0s) Loss: 0.0365(0.0565) Grad: 0.4296  
EVAL: [0/8358] Data 0.528 (0.528) Elapsed 0m 0s (remain 96m 0s) Loss: 0.0009(0.0009) 
EVAL: [20/8358] Data 0.001 (0.027) Elapsed 0m 3s (remain 22m 8s) Loss: 0.0000(0.0182) 
EVAL: [40/8358] Data 0.001 (0.014) Elapsed 0m 5s (remain 20m 15s) Loss: 0.0000(0.0096) 
EVAL: [60/8358] Data 0.001 (0.010) Elapsed 0m 8s (remain 19m 32s) Loss: 0.0007(0.0076) 
EVAL: [80/8358] Data 0.001 (0.008) Elapsed 0m 11s (remain 19m 6s) Loss: 0.0000(0.0171) 
EVAL: [100/8358] Data 0.001 (0.006) Elapsed 0m 13s (remain 18m 50s) Loss: 0.0000(0.0138) 
EVAL: [120/8358] Data 0.001 (0.006) Elapsed 0m 16s (remain 18m 38s) Loss: 0.0001(0.0116) 
EVAL: [140/8358] Data 0.001 (0.005) Elapsed 0m 19s (remain 18m 29s) Loss: 0.0000(0.0102) 
EVAL: [160/8358] Data 0.002 (0.004) Elapsed 0m 21s (remain 18m 21s) Loss: 0.0000(0.0090) 
EVAL: [180/8358] Data 0.001 (0.004) Elapsed 0m 24s (remain 18m 15s) Loss: 0.0000(0.0096) 
EVAL: [200/8358] Data 0.001 (0.004) Elapsed 0m 26s (remain 18m 9s) Loss: 0.0000(0.0089) 
EVAL: [220/8358] Data 0.001 (0.004) Elapsed 0m 29s (remain 18m 4s) Loss: 0.0000(0.0089) 
EVAL: [240/8358] Data 0.001 (0.003) Elapsed 0m 32s (remain 18m 0s) Loss: 0.0000(0.0083) 
EVAL: [260/8358] Data 0.001 (0.003) Elapsed 0m 34s (remain 17m 55s) Loss: 0.0000(0.0077) 
EVAL: [280/8358] Data 0.001 (0.003) Elapsed 0m 37s (remain 17m 51s) Loss: 0.0000(0.0073) 
EVAL: [300/8358] Data 0.001 (0.003) Elapsed 0m 39s (remain 17m 47s) Loss: 0.0001(0.0071) 
EVAL: [320/8358] Data 0.001 (0.003) Elapsed 0m 42s (remain 17m 43s) Loss: 0.0000(0.0066) 
EVAL: [340/8358] Data 0.001 (0.003) Elapsed 0m 45s (remain 17m 40s) Loss: 0.0000(0.0063) 
EVAL: [360/8358] Data 0.001 (0.003) Elapsed 0m 47s (remain 17m 36s) Loss: 0.0004(0.0071) 
EVAL: [380/8358] Data 0.001 (0.002) Elapsed 0m 50s (remain 17m 33s) Loss: 0.0000(0.0068) 
EVAL: [400/8358] Data 0.001 (0.002) Elapsed 0m 52s (remain 17m 29s) Loss: 0.0000(0.0073) 
EVAL: [420/8358] Data 0.002 (0.002) Elapsed 0m 55s (remain 17m 26s) Loss: 0.0000(0.0070) 
EVAL: [440/8358] Data 0.001 (0.002) Elapsed 0m 58s (remain 17m 23s) Loss: 0.0016(0.0068) 
EVAL: [460/8358] Data 0.001 (0.002) Elapsed 1m 0s (remain 17m 19s) Loss: 0.0016(0.0079) 
EVAL: [480/8358] Data 0.001 (0.002) Elapsed 1m 3s (remain 17m 16s) Loss: 0.0006(0.0077) 
EVAL: [500/8358] Data 0.001 (0.002) Elapsed 1m 5s (remain 17m 13s) Loss: 0.0000(0.0077) 
EVAL: [520/8358] Data 0.003 (0.002) Elapsed 1m 8s (remain 17m 10s) Loss: 0.0000(0.0077) 
EVAL: [540/8358] Data 0.002 (0.002) Elapsed 1m 11s (remain 17m 8s) Loss: 0.0004(0.0074) 
EVAL: [560/8358] Data 0.001 (0.002) Elapsed 1m 13s (remain 17m 5s) Loss: 0.0001(0.0073) 
EVAL: [580/8358] Data 0.001 (0.002) Elapsed 1m 16s (remain 17m 2s) Loss: 0.0097(0.0071) 
EVAL: [600/8358] Data 0.001 (0.002) Elapsed 1m 18s (remain 16m 59s) Loss: 0.0000(0.0069) 
EVAL: [620/8358] Data 0.001 (0.002) Elapsed 1m 21s (remain 16m 56s) Loss: 0.0004(0.0067) 
EVAL: [640/8358] Data 0.001 (0.002) Elapsed 1m 24s (remain 16m 53s) Loss: 0.0032(0.0067) 
EVAL: [660/8358] Data 0.002 (0.002) Elapsed 1m 26s (remain 16m 50s) Loss: 0.0000(0.0067) 
EVAL: [680/8358] Data 0.002 (0.002) Elapsed 1m 29s (remain 16m 47s) Loss: 0.0000(0.0070) 
EVAL: [700/8358] Data 0.001 (0.002) Elapsed 1m 32s (remain 16m 45s) Loss: 0.0000(0.0069) 
EVAL: [720/8358] Data 0.001 (0.002) Elapsed 1m 34s (remain 16m 42s) Loss: 0.0000(0.0068) 
EVAL: [740/8358] Data 0.001 (0.002) Elapsed 1m 37s (remain 16m 39s) Loss: 0.0000(0.0071) 
EVAL: [760/8358] Data 0.001 (0.002) Elapsed 1m 39s (remain 16m 36s) Loss: 0.0000(0.0069) 
EVAL: [780/8358] Data 0.001 (0.002) Elapsed 1m 42s (remain 16m 34s) Loss: 0.0001(0.0067) 
EVAL: [800/8358] Data 0.001 (0.002) Elapsed 1m 45s (remain 16m 31s) Loss: 0.0001(0.0066) 
EVAL: [820/8358] Data 0.001 (0.002) Elapsed 1m 47s (remain 16m 28s) Loss: 0.0000(0.0065) 
EVAL: [840/8358] Data 0.001 (0.002) Elapsed 1m 50s (remain 16m 25s) Loss: 0.0000(0.0064) 
EVAL: [860/8358] Data 0.001 (0.002) Elapsed 1m 52s (remain 16m 22s) Loss: 0.0000(0.0062) 
EVAL: [880/8358] Data 0.001 (0.002) Elapsed 1m 55s (remain 16m 20s) Loss: 0.0000(0.0061) 
EVAL: [900/8358] Data 0.001 (0.002) Elapsed 1m 58s (remain 16m 17s) Loss: 0.0000(0.0062) 
EVAL: [920/8358] Data 0.001 (0.002) Elapsed 2m 0s (remain 16m 14s) Loss: 0.2030(0.0063) 
EVAL: [940/8358] Data 0.001 (0.002) Elapsed 2m 3s (remain 16m 11s) Loss: 0.0000(0.0066) 
EVAL: [960/8358] Data 0.001 (0.002) Elapsed 2m 5s (remain 16m 8s) Loss: 0.0000(0.0065) 
EVAL: [980/8358] Data 0.001 (0.002) Elapsed 2m 8s (remain 16m 6s) Loss: 0.0003(0.0064) 
EVAL: [1000/8358] Data 0.001 (0.002) Elapsed 2m 11s (remain 16m 3s) Loss: 0.0000(0.0063) 
EVAL: [1020/8358] Data 0.001 (0.002) Elapsed 2m 13s (remain 16m 0s) Loss: 0.0000(0.0063) 
EVAL: [1040/8358] Data 0.001 (0.002) Elapsed 2m 16s (remain 15m 58s) Loss: 0.0000(0.0063) 
EVAL: [1060/8358] Data 0.001 (0.002) Elapsed 2m 18s (remain 15m 55s) Loss: 0.0000(0.0062) 
EVAL: [1080/8358] Data 0.001 (0.002) Elapsed 2m 21s (remain 15m 52s) Loss: 0.0042(0.0062) 
EVAL: [1100/8358] Data 0.001 (0.002) Elapsed 2m 24s (remain 15m 49s) Loss: 0.0003(0.0061) 
EVAL: [1120/8358] Data 0.001 (0.002) Elapsed 2m 26s (remain 15m 47s) Loss: 0.0001(0.0060) 
EVAL: [1140/8358] Data 0.001 (0.002) Elapsed 2m 29s (remain 15m 44s) Loss: 0.0000(0.0059) 
EVAL: [1160/8358] Data 0.002 (0.002) Elapsed 2m 31s (remain 15m 41s) Loss: 0.0000(0.0060) 
EVAL: [1180/8358] Data 0.001 (0.002) Elapsed 2m 34s (remain 15m 39s) Loss: 0.0000(0.0059) 
EVAL: [1200/8358] Data 0.001 (0.002) Elapsed 2m 37s (remain 15m 36s) Loss: 0.0000(0.0058) 
EVAL: [1220/8358] Data 0.001 (0.002) Elapsed 2m 39s (remain 15m 33s) Loss: 0.0000(0.0057) 
EVAL: [1240/8358] Data 0.001 (0.002) Elapsed 2m 42s (remain 15m 31s) Loss: 0.0000(0.0057) 
EVAL: [1260/8358] Data 0.004 (0.002) Elapsed 2m 44s (remain 15m 28s) Loss: 0.0002(0.0056) 
EVAL: [1280/8358] Data 0.001 (0.002) Elapsed 2m 47s (remain 15m 25s) Loss: 0.0000(0.0056) 
EVAL: [1300/8358] Data 0.001 (0.002) Elapsed 2m 50s (remain 15m 23s) Loss: 0.0000(0.0055) 
EVAL: [1320/8358] Data 0.001 (0.001) Elapsed 2m 52s (remain 15m 20s) Loss: 0.0000(0.0054) 
EVAL: [1340/8358] Data 0.001 (0.001) Elapsed 2m 55s (remain 15m 17s) Loss: 0.2715(0.0059) 
EVAL: [1360/8358] Data 0.001 (0.001) Elapsed 2m 57s (remain 15m 15s) Loss: 0.0000(0.0058) 
EVAL: [1380/8358] Data 0.001 (0.001) Elapsed 3m 0s (remain 15m 12s) Loss: 0.0000(0.0058) 
EVAL: [1400/8358] Data 0.001 (0.001) Elapsed 3m 3s (remain 15m 9s) Loss: 0.0000(0.0057) 
EVAL: [1420/8358] Data 0.001 (0.001) Elapsed 3m 5s (remain 15m 7s) Loss: 0.0095(0.0057) 
EVAL: [1440/8358] Data 0.001 (0.001) Elapsed 3m 8s (remain 15m 4s) Loss: 0.0000(0.0056) 
EVAL: [1460/8358] Data 0.001 (0.001) Elapsed 3m 11s (remain 15m 1s) Loss: 0.0000(0.0056) 
EVAL: [1480/8358] Data 0.001 (0.001) Elapsed 3m 13s (remain 14m 59s) Loss: 0.0000(0.0056) 
EVAL: [1500/8358] Data 0.001 (0.001) Elapsed 3m 16s (remain 14m 56s) Loss: 0.0000(0.0055) 
EVAL: [1520/8358] Data 0.001 (0.001) Elapsed 3m 18s (remain 14m 54s) Loss: 0.0125(0.0054) 
EVAL: [1540/8358] Data 0.001 (0.001) Elapsed 3m 21s (remain 14m 51s) Loss: 0.0000(0.0057) 
EVAL: [1560/8358] Data 0.001 (0.001) Elapsed 3m 24s (remain 14m 48s) Loss: 0.0001(0.0057) 
EVAL: [1580/8358] Data 0.001 (0.001) Elapsed 3m 26s (remain 14m 46s) Loss: 0.0001(0.0057) 
EVAL: [1600/8358] Data 0.001 (0.001) Elapsed 3m 29s (remain 14m 43s) Loss: 0.0034(0.0056) 
EVAL: [1620/8358] Data 0.001 (0.001) Elapsed 3m 32s (remain 14m 41s) Loss: 0.0000(0.0058) 
EVAL: [1640/8358] Data 0.001 (0.001) Elapsed 3m 34s (remain 14m 38s) Loss: 0.0001(0.0058) 
EVAL: [1660/8358] Data 0.001 (0.001) Elapsed 3m 37s (remain 14m 36s) Loss: 0.0000(0.0058) 
EVAL: [1680/8358] Data 0.001 (0.001) Elapsed 3m 39s (remain 14m 33s) Loss: 0.0015(0.0059) 
EVAL: [1700/8358] Data 0.001 (0.001) Elapsed 3m 42s (remain 14m 30s) Loss: 0.0000(0.0059) 
EVAL: [1720/8358] Data 0.001 (0.001) Elapsed 3m 45s (remain 14m 28s) Loss: 0.0003(0.0064) 
EVAL: [1740/8358] Data 0.001 (0.001) Elapsed 3m 47s (remain 14m 25s) Loss: 0.0049(0.0066) 
EVAL: [1760/8358] Data 0.001 (0.001) Elapsed 3m 50s (remain 14m 23s) Loss: 0.0001(0.0066) 
EVAL: [1780/8358] Data 0.001 (0.001) Elapsed 3m 53s (remain 14m 20s) Loss: 0.0001(0.0065) 
EVAL: [1800/8358] Data 0.001 (0.001) Elapsed 3m 55s (remain 14m 18s) Loss: 0.0025(0.0066) 
EVAL: [1820/8358] Data 0.001 (0.001) Elapsed 3m 58s (remain 14m 15s) Loss: 0.0000(0.0066) 
EVAL: [1840/8358] Data 0.001 (0.001) Elapsed 4m 0s (remain 14m 13s) Loss: 0.0000(0.0067) 
EVAL: [1860/8358] Data 0.001 (0.001) Elapsed 4m 3s (remain 14m 10s) Loss: 0.0000(0.0066) 
EVAL: [1880/8358] Data 0.001 (0.001) Elapsed 4m 6s (remain 14m 7s) Loss: 0.0022(0.0065) 
EVAL: [1900/8358] Data 0.001 (0.001) Elapsed 4m 8s (remain 14m 5s) Loss: 0.0000(0.0065) 
EVAL: [1920/8358] Data 0.003 (0.001) Elapsed 4m 11s (remain 14m 2s) Loss: 0.0083(0.0064) 
EVAL: [1940/8358] Data 0.001 (0.001) Elapsed 4m 14s (remain 14m 0s) Loss: 0.0000(0.0064) 
EVAL: [1960/8358] Data 0.001 (0.001) Elapsed 4m 16s (remain 13m 57s) Loss: 0.0000(0.0065) 
EVAL: [1980/8358] Data 0.005 (0.001) Elapsed 4m 19s (remain 13m 55s) Loss: 0.0046(0.0065) 
EVAL: [2000/8358] Data 0.001 (0.001) Elapsed 4m 22s (remain 13m 52s) Loss: 0.0001(0.0065) 
EVAL: [2020/8358] Data 0.001 (0.001) Elapsed 4m 24s (remain 13m 49s) Loss: 0.0000(0.0065) 
EVAL: [2040/8358] Data 0.001 (0.001) Elapsed 4m 27s (remain 13m 47s) Loss: 0.0000(0.0065) 
EVAL: [2060/8358] Data 0.001 (0.001) Elapsed 4m 29s (remain 13m 44s) Loss: 0.0000(0.0065) 
EVAL: [2080/8358] Data 0.001 (0.001) Elapsed 4m 32s (remain 13m 42s) Loss: 0.0000(0.0065) 
EVAL: [2100/8358] Data 0.002 (0.001) Elapsed 4m 35s (remain 13m 39s) Loss: 0.0001(0.0064) 
EVAL: [2120/8358] Data 0.001 (0.001) Elapsed 4m 37s (remain 13m 36s) Loss: 0.0001(0.0067) 
EVAL: [2140/8358] Data 0.001 (0.001) Elapsed 4m 40s (remain 13m 34s) Loss: 0.0000(0.0067) 
EVAL: [2160/8358] Data 0.001 (0.001) Elapsed 4m 43s (remain 13m 31s) Loss: 0.0001(0.0066) 
EVAL: [2180/8358] Data 0.001 (0.001) Elapsed 4m 45s (remain 13m 29s) Loss: 0.0000(0.0066) 
EVAL: [2200/8358] Data 0.001 (0.001) Elapsed 4m 48s (remain 13m 26s) Loss: 0.0000(0.0066) 
EVAL: [2220/8358] Data 0.001 (0.001) Elapsed 4m 50s (remain 13m 24s) Loss: 0.0420(0.0066) 
EVAL: [2240/8358] Data 0.001 (0.001) Elapsed 4m 53s (remain 13m 21s) Loss: 0.0020(0.0066) 
EVAL: [2260/8358] Data 0.001 (0.001) Elapsed 4m 56s (remain 13m 18s) Loss: 0.0001(0.0065) 
EVAL: [2280/8358] Data 0.001 (0.001) Elapsed 4m 58s (remain 13m 16s) Loss: 0.0002(0.0068) 
EVAL: [2300/8358] Data 0.001 (0.001) Elapsed 5m 1s (remain 13m 13s) Loss: 0.0151(0.0068) 
EVAL: [2320/8358] Data 0.001 (0.001) Elapsed 5m 4s (remain 13m 11s) Loss: 0.0000(0.0068) 
EVAL: [2340/8358] Data 0.001 (0.001) Elapsed 5m 6s (remain 13m 8s) Loss: 0.0011(0.0068) 
EVAL: [2360/8358] Data 0.001 (0.001) Elapsed 5m 9s (remain 13m 5s) Loss: 0.0000(0.0069) 
EVAL: [2380/8358] Data 0.001 (0.001) Elapsed 5m 12s (remain 13m 3s) Loss: 0.0000(0.0068) 
EVAL: [2400/8358] Data 0.001 (0.001) Elapsed 5m 14s (remain 13m 0s) Loss: 0.0000(0.0068) 
EVAL: [2420/8358] Data 0.001 (0.001) Elapsed 5m 17s (remain 12m 58s) Loss: 0.0002(0.0067) 
EVAL: [2440/8358] Data 0.001 (0.001) Elapsed 5m 19s (remain 12m 55s) Loss: 0.0200(0.0068) 
EVAL: [2460/8358] Data 0.001 (0.001) Elapsed 5m 22s (remain 12m 52s) Loss: 0.0000(0.0068) 
EVAL: [2480/8358] Data 0.001 (0.001) Elapsed 5m 25s (remain 12m 50s) Loss: 0.0000(0.0067) 
EVAL: [2500/8358] Data 0.001 (0.001) Elapsed 5m 27s (remain 12m 47s) Loss: 0.0000(0.0067) 
EVAL: [2520/8358] Data 0.001 (0.001) Elapsed 5m 30s (remain 12m 45s) Loss: 0.0006(0.0067) 
EVAL: [2540/8358] Data 0.001 (0.001) Elapsed 5m 33s (remain 12m 42s) Loss: 0.0000(0.0067) 
EVAL: [2560/8358] Data 0.001 (0.001) Elapsed 5m 35s (remain 12m 39s) Loss: 0.0000(0.0066) 
EVAL: [2580/8358] Data 0.001 (0.001) Elapsed 5m 38s (remain 12m 37s) Loss: 0.0031(0.0069) 
EVAL: [2600/8358] Data 0.001 (0.001) Elapsed 5m 40s (remain 12m 34s) Loss: 0.0000(0.0069) 
EVAL: [2620/8358] Data 0.001 (0.001) Elapsed 5m 43s (remain 12m 32s) Loss: 0.0000(0.0069) 
EVAL: [2640/8358] Data 0.001 (0.001) Elapsed 5m 46s (remain 12m 29s) Loss: 0.0000(0.0069) 
EVAL: [2660/8358] Data 0.001 (0.001) Elapsed 5m 48s (remain 12m 26s) Loss: 0.0000(0.0069) 
EVAL: [2680/8358] Data 0.001 (0.001) Elapsed 5m 51s (remain 12m 24s) Loss: 0.0001(0.0070) 
EVAL: [2700/8358] Data 0.001 (0.001) Elapsed 5m 54s (remain 12m 21s) Loss: 0.0008(0.0070) 
EVAL: [2720/8358] Data 0.001 (0.001) Elapsed 5m 56s (remain 12m 19s) Loss: 0.0000(0.0069) 
EVAL: [2740/8358] Data 0.001 (0.001) Elapsed 5m 59s (remain 12m 16s) Loss: 0.0030(0.0069) 
EVAL: [2760/8358] Data 0.001 (0.001) Elapsed 6m 2s (remain 12m 13s) Loss: 0.0000(0.0070) 
EVAL: [2780/8358] Data 0.001 (0.001) Elapsed 6m 4s (remain 12m 11s) Loss: 0.0000(0.0073) 
EVAL: [2800/8358] Data 0.001 (0.001) Elapsed 6m 7s (remain 12m 8s) Loss: 0.0000(0.0072) 
EVAL: [2820/8358] Data 0.001 (0.001) Elapsed 6m 9s (remain 12m 6s) Loss: 0.0000(0.0072) 
EVAL: [2840/8358] Data 0.001 (0.001) Elapsed 6m 12s (remain 12m 3s) Loss: 0.0000(0.0072) 
EVAL: [2860/8358] Data 0.001 (0.001) Elapsed 6m 15s (remain 12m 0s) Loss: 0.0000(0.0071) 
EVAL: [2880/8358] Data 0.001 (0.001) Elapsed 6m 17s (remain 11m 58s) Loss: 0.0005(0.0072) 
EVAL: [2900/8358] Data 0.001 (0.001) Elapsed 6m 20s (remain 11m 55s) Loss: 0.0011(0.0071) 
EVAL: [2920/8358] Data 0.001 (0.001) Elapsed 6m 23s (remain 11m 52s) Loss: 0.0001(0.0071) 
EVAL: [2940/8358] Data 0.002 (0.001) Elapsed 6m 25s (remain 11m 50s) Loss: 0.0004(0.0070) 
EVAL: [2960/8358] Data 0.003 (0.001) Elapsed 6m 28s (remain 11m 47s) Loss: 0.0001(0.0070) 
EVAL: [2980/8358] Data 0.002 (0.001) Elapsed 6m 30s (remain 11m 45s) Loss: 0.0003(0.0070) 
EVAL: [3000/8358] Data 0.001 (0.001) Elapsed 6m 33s (remain 11m 42s) Loss: 0.0000(0.0069) 
EVAL: [3020/8358] Data 0.001 (0.001) Elapsed 6m 36s (remain 11m 39s) Loss: 0.0000(0.0070) 
EVAL: [3040/8358] Data 0.001 (0.001) Elapsed 6m 38s (remain 11m 37s) Loss: 0.0000(0.0069) 
EVAL: [3060/8358] Data 0.001 (0.001) Elapsed 6m 41s (remain 11m 34s) Loss: 0.0000(0.0070) 
EVAL: [3080/8358] Data 0.002 (0.001) Elapsed 6m 44s (remain 11m 32s) Loss: 0.0000(0.0070) 
EVAL: [3100/8358] Data 0.001 (0.001) Elapsed 6m 46s (remain 11m 29s) Loss: 0.0003(0.0070) 
EVAL: [3120/8358] Data 0.001 (0.001) Elapsed 6m 49s (remain 11m 26s) Loss: 0.0002(0.0069) 
EVAL: [3140/8358] Data 0.001 (0.001) Elapsed 6m 51s (remain 11m 24s) Loss: 0.0000(0.0069) 
EVAL: [3160/8358] Data 0.001 (0.001) Elapsed 6m 54s (remain 11m 21s) Loss: 0.0001(0.0068) 
EVAL: [3180/8358] Data 0.001 (0.001) Elapsed 6m 57s (remain 11m 18s) Loss: 0.0069(0.0069) 
EVAL: [3200/8358] Data 0.001 (0.001) Elapsed 6m 59s (remain 11m 16s) Loss: 0.0000(0.0069) 
EVAL: [3220/8358] Data 0.001 (0.001) Elapsed 7m 2s (remain 11m 13s) Loss: 0.0000(0.0068) 
EVAL: [3240/8358] Data 0.001 (0.001) Elapsed 7m 5s (remain 11m 11s) Loss: 0.0000(0.0068) 
EVAL: [3260/8358] Data 0.001 (0.001) Elapsed 7m 7s (remain 11m 8s) Loss: 0.0019(0.0068) 
EVAL: [3280/8358] Data 0.001 (0.001) Elapsed 7m 10s (remain 11m 5s) Loss: 0.0000(0.0068) 
EVAL: [3300/8358] Data 0.001 (0.001) Elapsed 7m 12s (remain 11m 3s) Loss: 0.0000(0.0068) 
EVAL: [3320/8358] Data 0.001 (0.001) Elapsed 7m 15s (remain 11m 0s) Loss: 0.0000(0.0068) 
EVAL: [3340/8358] Data 0.002 (0.001) Elapsed 7m 18s (remain 10m 58s) Loss: 0.0001(0.0068) 
EVAL: [3360/8358] Data 0.001 (0.001) Elapsed 7m 20s (remain 10m 55s) Loss: 0.0022(0.0067) 
EVAL: [3380/8358] Data 0.001 (0.001) Elapsed 7m 23s (remain 10m 52s) Loss: 0.0000(0.0069) 
EVAL: [3400/8358] Data 0.001 (0.001) Elapsed 7m 26s (remain 10m 50s) Loss: 0.0002(0.0070) 
EVAL: [3420/8358] Data 0.001 (0.001) Elapsed 7m 28s (remain 10m 47s) Loss: 0.0000(0.0069) 
EVAL: [3440/8358] Data 0.001 (0.001) Elapsed 7m 31s (remain 10m 44s) Loss: 0.0064(0.0069) 
EVAL: [3460/8358] Data 0.001 (0.001) Elapsed 7m 34s (remain 10m 42s) Loss: 0.0000(0.0069) 
EVAL: [3480/8358] Data 0.001 (0.001) Elapsed 7m 36s (remain 10m 39s) Loss: 0.0000(0.0070) 
EVAL: [3500/8358] Data 0.001 (0.001) Elapsed 7m 39s (remain 10m 37s) Loss: 0.0010(0.0071) 
EVAL: [3520/8358] Data 0.001 (0.001) Elapsed 7m 41s (remain 10m 34s) Loss: 0.0001(0.0071) 
EVAL: [3540/8358] Data 0.001 (0.001) Elapsed 7m 44s (remain 10m 31s) Loss: 0.0000(0.0070) 
EVAL: [3560/8358] Data 0.001 (0.001) Elapsed 7m 47s (remain 10m 29s) Loss: 0.0001(0.0070) 
EVAL: [3580/8358] Data 0.001 (0.001) Elapsed 7m 49s (remain 10m 26s) Loss: 0.0000(0.0070) 
EVAL: [3600/8358] Data 0.001 (0.001) Elapsed 7m 52s (remain 10m 24s) Loss: 0.0001(0.0070) 
EVAL: [3620/8358] Data 0.001 (0.001) Elapsed 7m 55s (remain 10m 21s) Loss: 0.0000(0.0072) 
EVAL: [3640/8358] Data 0.001 (0.001) Elapsed 7m 57s (remain 10m 18s) Loss: 0.0140(0.0072) 
EVAL: [3660/8358] Data 0.001 (0.001) Elapsed 8m 0s (remain 10m 16s) Loss: 0.0001(0.0072) 
EVAL: [3680/8358] Data 0.001 (0.001) Elapsed 8m 2s (remain 10m 13s) Loss: 0.0000(0.0071) 
EVAL: [3700/8358] Data 0.001 (0.001) Elapsed 8m 5s (remain 10m 10s) Loss: 0.0033(0.0071) 
EVAL: [3720/8358] Data 0.001 (0.001) Elapsed 8m 8s (remain 10m 8s) Loss: 0.0000(0.0071) 
EVAL: [3740/8358] Data 0.001 (0.001) Elapsed 8m 10s (remain 10m 5s) Loss: 0.0000(0.0071) 
EVAL: [3760/8358] Data 0.001 (0.001) Elapsed 8m 13s (remain 10m 3s) Loss: 0.0002(0.0071) 
EVAL: [3780/8358] Data 0.001 (0.001) Elapsed 8m 16s (remain 10m 0s) Loss: 0.0000(0.0071) 
EVAL: [3800/8358] Data 0.001 (0.001) Elapsed 8m 18s (remain 9m 57s) Loss: 0.0000(0.0073) 
EVAL: [3820/8358] Data 0.001 (0.001) Elapsed 8m 21s (remain 9m 55s) Loss: 0.0000(0.0072) 
EVAL: [3840/8358] Data 0.001 (0.001) Elapsed 8m 23s (remain 9m 52s) Loss: 0.0000(0.0072) 
EVAL: [3860/8358] Data 0.001 (0.001) Elapsed 8m 26s (remain 9m 49s) Loss: 0.0111(0.0073) 
EVAL: [3880/8358] Data 0.001 (0.001) Elapsed 8m 29s (remain 9m 47s) Loss: 0.0000(0.0073) 
EVAL: [3900/8358] Data 0.001 (0.001) Elapsed 8m 31s (remain 9m 44s) Loss: 0.0013(0.0073) 
EVAL: [3920/8358] Data 0.003 (0.001) Elapsed 8m 34s (remain 9m 42s) Loss: 0.0000(0.0072) 
EVAL: [3940/8358] Data 0.001 (0.001) Elapsed 8m 37s (remain 9m 39s) Loss: 0.0000(0.0072) 
EVAL: [3960/8358] Data 0.001 (0.001) Elapsed 8m 39s (remain 9m 36s) Loss: 0.0131(0.0072) 
EVAL: [3980/8358] Data 0.001 (0.001) Elapsed 8m 42s (remain 9m 34s) Loss: 0.0006(0.0072) 
EVAL: [4000/8358] Data 0.001 (0.001) Elapsed 8m 44s (remain 9m 31s) Loss: 0.0000(0.0072) 
EVAL: [4020/8358] Data 0.001 (0.001) Elapsed 8m 47s (remain 9m 28s) Loss: 0.0002(0.0071) 
EVAL: [4040/8358] Data 0.001 (0.001) Elapsed 8m 50s (remain 9m 26s) Loss: 0.0004(0.0071) 
EVAL: [4060/8358] Data 0.001 (0.001) Elapsed 8m 52s (remain 9m 23s) Loss: 0.0000(0.0072) 
EVAL: [4080/8358] Data 0.001 (0.001) Elapsed 8m 55s (remain 9m 21s) Loss: 0.0000(0.0072) 
EVAL: [4100/8358] Data 0.001 (0.001) Elapsed 8m 58s (remain 9m 18s) Loss: 0.0000(0.0071) 
EVAL: [4120/8358] Data 0.001 (0.001) Elapsed 9m 0s (remain 9m 15s) Loss: 0.0000(0.0071) 
EVAL: [4140/8358] Data 0.001 (0.001) Elapsed 9m 3s (remain 9m 13s) Loss: 0.0018(0.0071) 
EVAL: [4160/8358] Data 0.001 (0.001) Elapsed 9m 5s (remain 9m 10s) Loss: 0.0000(0.0072) 
EVAL: [4180/8358] Data 0.001 (0.001) Elapsed 9m 8s (remain 9m 8s) Loss: 0.0014(0.0071) 
EVAL: [4200/8358] Data 0.001 (0.001) Elapsed 9m 11s (remain 9m 5s) Loss: 0.0000(0.0071) 
EVAL: [4220/8358] Data 0.001 (0.001) Elapsed 9m 13s (remain 9m 2s) Loss: 0.0002(0.0071) 
EVAL: [4240/8358] Data 0.001 (0.001) Elapsed 9m 16s (remain 9m 0s) Loss: 0.0001(0.0072) 
EVAL: [4260/8358] Data 0.001 (0.001) Elapsed 9m 19s (remain 8m 57s) Loss: 0.0022(0.0072) 
EVAL: [4280/8358] Data 0.002 (0.001) Elapsed 9m 21s (remain 8m 54s) Loss: 0.0006(0.0072) 
EVAL: [4300/8358] Data 0.001 (0.001) Elapsed 9m 24s (remain 8m 52s) Loss: 0.0007(0.0072) 
EVAL: [4320/8358] Data 0.001 (0.001) Elapsed 9m 26s (remain 8m 49s) Loss: 0.0000(0.0072) 
EVAL: [4340/8358] Data 0.001 (0.001) Elapsed 9m 29s (remain 8m 47s) Loss: 0.0000(0.0072) 
EVAL: [4360/8358] Data 0.001 (0.001) Elapsed 9m 32s (remain 8m 44s) Loss: 0.0001(0.0072) 
EVAL: [4380/8358] Data 0.001 (0.001) Elapsed 9m 34s (remain 8m 41s) Loss: 0.0001(0.0071) 
EVAL: [4400/8358] Data 0.003 (0.001) Elapsed 9m 37s (remain 8m 39s) Loss: 0.0000(0.0072) 
EVAL: [4420/8358] Data 0.001 (0.001) Elapsed 9m 40s (remain 8m 36s) Loss: 0.0570(0.0072) 
EVAL: [4440/8358] Data 0.001 (0.001) Elapsed 9m 42s (remain 8m 33s) Loss: 0.0000(0.0072) 
EVAL: [4460/8358] Data 0.001 (0.001) Elapsed 9m 45s (remain 8m 31s) Loss: 0.0000(0.0072) 
EVAL: [4480/8358] Data 0.001 (0.001) Elapsed 9m 47s (remain 8m 28s) Loss: 0.0001(0.0071) 
EVAL: [4500/8358] Data 0.001 (0.001) Elapsed 9m 50s (remain 8m 26s) Loss: 0.0002(0.0071) 
EVAL: [4520/8358] Data 0.003 (0.001) Elapsed 9m 53s (remain 8m 23s) Loss: 0.0004(0.0071) 
EVAL: [4540/8358] Data 0.001 (0.001) Elapsed 9m 55s (remain 8m 20s) Loss: 0.0001(0.0072) 
EVAL: [4560/8358] Data 0.002 (0.001) Elapsed 9m 58s (remain 8m 18s) Loss: 0.0037(0.0072) 
EVAL: [4580/8358] Data 0.001 (0.001) Elapsed 10m 1s (remain 8m 15s) Loss: 0.0000(0.0071) 
EVAL: [4600/8358] Data 0.001 (0.001) Elapsed 10m 3s (remain 8m 12s) Loss: 0.0000(0.0072) 
EVAL: [4620/8358] Data 0.001 (0.001) Elapsed 10m 6s (remain 8m 10s) Loss: 0.0000(0.0072) 
EVAL: [4640/8358] Data 0.002 (0.001) Elapsed 10m 8s (remain 8m 7s) Loss: 0.0006(0.0071) 
EVAL: [4660/8358] Data 0.001 (0.001) Elapsed 10m 11s (remain 8m 5s) Loss: 0.0000(0.0071) 
EVAL: [4680/8358] Data 0.001 (0.001) Elapsed 10m 14s (remain 8m 2s) Loss: 0.0000(0.0071) 
EVAL: [4700/8358] Data 0.001 (0.001) Elapsed 10m 16s (remain 7m 59s) Loss: 0.0005(0.0071) 
EVAL: [4720/8358] Data 0.001 (0.001) Elapsed 10m 19s (remain 7m 57s) Loss: 0.0000(0.0070) 
EVAL: [4740/8358] Data 0.002 (0.001) Elapsed 10m 22s (remain 7m 54s) Loss: 0.0055(0.0070) 
EVAL: [4760/8358] Data 0.001 (0.001) Elapsed 10m 24s (remain 7m 51s) Loss: 0.0000(0.0070) 
EVAL: [4780/8358] Data 0.001 (0.001) Elapsed 10m 27s (remain 7m 49s) Loss: 0.0000(0.0070) 
EVAL: [4800/8358] Data 0.001 (0.001) Elapsed 10m 29s (remain 7m 46s) Loss: 0.0000(0.0069) 
EVAL: [4820/8358] Data 0.001 (0.001) Elapsed 10m 32s (remain 7m 44s) Loss: 0.0013(0.0070) 
EVAL: [4840/8358] Data 0.001 (0.001) Elapsed 10m 35s (remain 7m 41s) Loss: 0.0001(0.0070) 
EVAL: [4860/8358] Data 0.001 (0.001) Elapsed 10m 37s (remain 7m 38s) Loss: 0.0000(0.0070) 
EVAL: [4880/8358] Data 0.004 (0.001) Elapsed 10m 40s (remain 7m 36s) Loss: 0.0000(0.0070) 
EVAL: [4900/8358] Data 0.001 (0.001) Elapsed 10m 43s (remain 7m 33s) Loss: 0.0000(0.0071) 
EVAL: [4920/8358] Data 0.001 (0.001) Elapsed 10m 45s (remain 7m 31s) Loss: 0.0008(0.0070) 
EVAL: [4940/8358] Data 0.001 (0.001) Elapsed 10m 48s (remain 7m 28s) Loss: 0.0000(0.0070) 
EVAL: [4960/8358] Data 0.001 (0.001) Elapsed 10m 50s (remain 7m 25s) Loss: 0.0000(0.0070) 
EVAL: [4980/8358] Data 0.001 (0.001) Elapsed 10m 53s (remain 7m 23s) Loss: 0.0000(0.0070) 
EVAL: [5000/8358] Data 0.001 (0.001) Elapsed 10m 56s (remain 7m 20s) Loss: 0.0241(0.0071) 
EVAL: [5020/8358] Data 0.001 (0.001) Elapsed 10m 58s (remain 7m 17s) Loss: 0.0000(0.0071) 
EVAL: [5040/8358] Data 0.001 (0.001) Elapsed 11m 1s (remain 7m 15s) Loss: 0.0000(0.0071) 
EVAL: [5060/8358] Data 0.001 (0.001) Elapsed 11m 4s (remain 7m 12s) Loss: 0.0005(0.0072) 
EVAL: [5080/8358] Data 0.001 (0.001) Elapsed 11m 6s (remain 7m 9s) Loss: 0.0000(0.0072) 
EVAL: [5100/8358] Data 0.001 (0.001) Elapsed 11m 9s (remain 7m 7s) Loss: 0.0000(0.0071) 
EVAL: [5120/8358] Data 0.004 (0.001) Elapsed 11m 11s (remain 7m 4s) Loss: 0.0000(0.0071) 
EVAL: [5140/8358] Data 0.001 (0.001) Elapsed 11m 14s (remain 7m 2s) Loss: 0.0000(0.0073) 
EVAL: [5160/8358] Data 0.001 (0.001) Elapsed 11m 17s (remain 6m 59s) Loss: 0.0000(0.0073) 
EVAL: [5180/8358] Data 0.001 (0.001) Elapsed 11m 19s (remain 6m 56s) Loss: 0.0001(0.0073) 
EVAL: [5200/8358] Data 0.001 (0.001) Elapsed 11m 22s (remain 6m 54s) Loss: 0.0000(0.0073) 
EVAL: [5220/8358] Data 0.001 (0.001) Elapsed 11m 25s (remain 6m 51s) Loss: 0.0000(0.0073) 
EVAL: [5240/8358] Data 0.001 (0.001) Elapsed 11m 27s (remain 6m 49s) Loss: 0.0000(0.0073) 
EVAL: [5260/8358] Data 0.001 (0.001) Elapsed 11m 30s (remain 6m 46s) Loss: 0.0000(0.0073) 
EVAL: [5280/8358] Data 0.001 (0.001) Elapsed 11m 32s (remain 6m 43s) Loss: 0.0000(0.0072) 
EVAL: [5300/8358] Data 0.001 (0.001) Elapsed 11m 35s (remain 6m 41s) Loss: 0.0000(0.0072) 
EVAL: [5320/8358] Data 0.001 (0.001) Elapsed 11m 38s (remain 6m 38s) Loss: 0.0000(0.0073) 
EVAL: [5340/8358] Data 0.001 (0.001) Elapsed 11m 40s (remain 6m 35s) Loss: 0.0031(0.0073) 
EVAL: [5360/8358] Data 0.001 (0.001) Elapsed 11m 43s (remain 6m 33s) Loss: 0.0000(0.0072) 
EVAL: [5380/8358] Data 0.002 (0.001) Elapsed 11m 46s (remain 6m 30s) Loss: 0.0001(0.0072) 
EVAL: [5400/8358] Data 0.001 (0.001) Elapsed 11m 48s (remain 6m 28s) Loss: 0.0001(0.0073) 
EVAL: [5420/8358] Data 0.001 (0.001) Elapsed 11m 51s (remain 6m 25s) Loss: 0.0092(0.0073) 
EVAL: [5440/8358] Data 0.001 (0.001) Elapsed 11m 53s (remain 6m 22s) Loss: 0.0001(0.0073) 
EVAL: [5460/8358] Data 0.001 (0.001) Elapsed 11m 56s (remain 6m 20s) Loss: 0.0000(0.0073) 
EVAL: [5480/8358] Data 0.001 (0.001) Elapsed 11m 59s (remain 6m 17s) Loss: 0.0000(0.0073) 
EVAL: [5500/8358] Data 0.001 (0.001) Elapsed 12m 1s (remain 6m 14s) Loss: 0.0000(0.0073) 
EVAL: [5520/8358] Data 0.001 (0.001) Elapsed 12m 4s (remain 6m 12s) Loss: 0.0000(0.0073) 
EVAL: [5540/8358] Data 0.001 (0.001) Elapsed 12m 7s (remain 6m 9s) Loss: 0.0000(0.0072) 
EVAL: [5560/8358] Data 0.001 (0.001) Elapsed 12m 9s (remain 6m 7s) Loss: 0.0348(0.0072) 
EVAL: [5580/8358] Data 0.001 (0.001) Elapsed 12m 12s (remain 6m 4s) Loss: 0.0000(0.0073) 
EVAL: [5600/8358] Data 0.001 (0.001) Elapsed 12m 14s (remain 6m 1s) Loss: 0.0000(0.0073) 
EVAL: [5620/8358] Data 0.001 (0.001) Elapsed 12m 17s (remain 5m 59s) Loss: 0.0000(0.0074) 
EVAL: [5640/8358] Data 0.001 (0.001) Elapsed 12m 20s (remain 5m 56s) Loss: 0.0079(0.0075) 
EVAL: [5660/8358] Data 0.001 (0.001) Elapsed 12m 22s (remain 5m 53s) Loss: 0.0000(0.0074) 
EVAL: [5680/8358] Data 0.001 (0.001) Elapsed 12m 25s (remain 5m 51s) Loss: 0.0000(0.0074) 
EVAL: [5700/8358] Data 0.001 (0.001) Elapsed 12m 28s (remain 5m 48s) Loss: 0.0000(0.0075) 
EVAL: [5720/8358] Data 0.001 (0.001) Elapsed 12m 30s (remain 5m 46s) Loss: 0.0002(0.0075) 
EVAL: [5740/8358] Data 0.001 (0.001) Elapsed 12m 33s (remain 5m 43s) Loss: 0.0000(0.0075) 
EVAL: [5760/8358] Data 0.001 (0.001) Elapsed 12m 35s (remain 5m 40s) Loss: 0.0007(0.0076) 
EVAL: [5780/8358] Data 0.001 (0.001) Elapsed 12m 38s (remain 5m 38s) Loss: 0.0000(0.0075) 
EVAL: [5800/8358] Data 0.001 (0.001) Elapsed 12m 41s (remain 5m 35s) Loss: 0.0000(0.0075) 
EVAL: [5820/8358] Data 0.001 (0.001) Elapsed 12m 43s (remain 5m 32s) Loss: 0.0003(0.0075) 
EVAL: [5840/8358] Data 0.001 (0.001) Elapsed 12m 46s (remain 5m 30s) Loss: 0.0000(0.0075) 
EVAL: [5860/8358] Data 0.001 (0.001) Elapsed 12m 49s (remain 5m 27s) Loss: 0.0000(0.0075) 
EVAL: [5880/8358] Data 0.001 (0.001) Elapsed 12m 51s (remain 5m 25s) Loss: 0.0000(0.0075) 
EVAL: [5900/8358] Data 0.001 (0.001) Elapsed 12m 54s (remain 5m 22s) Loss: 0.0004(0.0075) 
EVAL: [5920/8358] Data 0.001 (0.001) Elapsed 12m 56s (remain 5m 19s) Loss: 0.0006(0.0074) 
EVAL: [5940/8358] Data 0.001 (0.001) Elapsed 12m 59s (remain 5m 17s) Loss: 0.0000(0.0074) 
EVAL: [5960/8358] Data 0.001 (0.001) Elapsed 13m 2s (remain 5m 14s) Loss: 0.0026(0.0074) 
EVAL: [5980/8358] Data 0.001 (0.001) Elapsed 13m 4s (remain 5m 11s) Loss: 0.0001(0.0074) 
EVAL: [6000/8358] Data 0.001 (0.001) Elapsed 13m 7s (remain 5m 9s) Loss: 0.0000(0.0074) 
EVAL: [6020/8358] Data 0.001 (0.001) Elapsed 13m 10s (remain 5m 6s) Loss: 0.0000(0.0074) 
EVAL: [6040/8358] Data 0.001 (0.001) Elapsed 13m 12s (remain 5m 4s) Loss: 0.0000(0.0074) 
EVAL: [6060/8358] Data 0.001 (0.001) Elapsed 13m 15s (remain 5m 1s) Loss: 0.0000(0.0074) 
EVAL: [6080/8358] Data 0.001 (0.001) Elapsed 13m 17s (remain 4m 58s) Loss: 0.0000(0.0073) 
EVAL: [6100/8358] Data 0.001 (0.001) Elapsed 13m 20s (remain 4m 56s) Loss: 0.0000(0.0074) 
EVAL: [6120/8358] Data 0.001 (0.001) Elapsed 13m 23s (remain 4m 53s) Loss: 0.0004(0.0073) 
EVAL: [6140/8358] Data 0.001 (0.001) Elapsed 13m 25s (remain 4m 50s) Loss: 0.0006(0.0073) 
EVAL: [6160/8358] Data 0.002 (0.001) Elapsed 13m 28s (remain 4m 48s) Loss: 0.0000(0.0073) 
EVAL: [6180/8358] Data 0.001 (0.001) Elapsed 13m 31s (remain 4m 45s) Loss: 0.0001(0.0073) 
EVAL: [6200/8358] Data 0.001 (0.001) Elapsed 13m 33s (remain 4m 43s) Loss: 0.0000(0.0074) 
EVAL: [6220/8358] Data 0.001 (0.001) Elapsed 13m 36s (remain 4m 40s) Loss: 0.0000(0.0074) 
EVAL: [6240/8358] Data 0.001 (0.001) Elapsed 13m 38s (remain 4m 37s) Loss: 0.0385(0.0075) 
EVAL: [6260/8358] Data 0.002 (0.001) Elapsed 13m 41s (remain 4m 35s) Loss: 0.0001(0.0074) 
EVAL: [6280/8358] Data 0.001 (0.001) Elapsed 13m 44s (remain 4m 32s) Loss: 0.0001(0.0075) 
EVAL: [6300/8358] Data 0.001 (0.001) Elapsed 13m 46s (remain 4m 29s) Loss: 0.0094(0.0075) 
EVAL: [6320/8358] Data 0.001 (0.001) Elapsed 13m 49s (remain 4m 27s) Loss: 0.0004(0.0075) 
EVAL: [6340/8358] Data 0.002 (0.001) Elapsed 13m 52s (remain 4m 24s) Loss: 0.0000(0.0075) 
EVAL: [6360/8358] Data 0.001 (0.001) Elapsed 13m 54s (remain 4m 22s) Loss: 0.0000(0.0075) 
EVAL: [6380/8358] Data 0.001 (0.001) Elapsed 13m 57s (remain 4m 19s) Loss: 0.0004(0.0075) 
EVAL: [6400/8358] Data 0.001 (0.001) Elapsed 14m 0s (remain 4m 16s) Loss: 0.0000(0.0076) 
EVAL: [6420/8358] Data 0.001 (0.001) Elapsed 14m 2s (remain 4m 14s) Loss: 0.0002(0.0076) 
EVAL: [6440/8358] Data 0.001 (0.001) Elapsed 14m 5s (remain 4m 11s) Loss: 0.0043(0.0076) 
EVAL: [6460/8358] Data 0.001 (0.001) Elapsed 14m 7s (remain 4m 8s) Loss: 0.0001(0.0077) 
EVAL: [6480/8358] Data 0.001 (0.001) Elapsed 14m 10s (remain 4m 6s) Loss: 0.0001(0.0077) 
EVAL: [6500/8358] Data 0.001 (0.001) Elapsed 14m 13s (remain 4m 3s) Loss: 0.0000(0.0077) 
EVAL: [6520/8358] Data 0.001 (0.001) Elapsed 14m 15s (remain 4m 1s) Loss: 0.0001(0.0077) 
EVAL: [6540/8358] Data 0.001 (0.001) Elapsed 14m 18s (remain 3m 58s) Loss: 0.0001(0.0078) 
EVAL: [6560/8358] Data 0.001 (0.001) Elapsed 14m 21s (remain 3m 55s) Loss: 0.0001(0.0077) 
EVAL: [6580/8358] Data 0.004 (0.001) Elapsed 14m 23s (remain 3m 53s) Loss: 0.0011(0.0077) 
EVAL: [6600/8358] Data 0.001 (0.001) Elapsed 14m 26s (remain 3m 50s) Loss: 0.0011(0.0078) 
EVAL: [6620/8358] Data 0.001 (0.001) Elapsed 14m 28s (remain 3m 47s) Loss: 0.0001(0.0079) 
EVAL: [6640/8358] Data 0.001 (0.001) Elapsed 14m 31s (remain 3m 45s) Loss: 0.0000(0.0079) 
EVAL: [6660/8358] Data 0.001 (0.001) Elapsed 14m 34s (remain 3m 42s) Loss: 0.0000(0.0079) 
EVAL: [6680/8358] Data 0.001 (0.001) Elapsed 14m 36s (remain 3m 40s) Loss: 0.0000(0.0079) 
EVAL: [6700/8358] Data 0.001 (0.001) Elapsed 14m 39s (remain 3m 37s) Loss: 0.0001(0.0079) 
EVAL: [6720/8358] Data 0.002 (0.001) Elapsed 14m 42s (remain 3m 34s) Loss: 0.0009(0.0079) 
EVAL: [6740/8358] Data 0.001 (0.001) Elapsed 14m 44s (remain 3m 32s) Loss: 0.0327(0.0079) 
EVAL: [6760/8358] Data 0.001 (0.001) Elapsed 14m 47s (remain 3m 29s) Loss: 0.0000(0.0081) 
EVAL: [6780/8358] Data 0.002 (0.001) Elapsed 14m 49s (remain 3m 26s) Loss: 0.0002(0.0080) 
EVAL: [6800/8358] Data 0.001 (0.001) Elapsed 14m 52s (remain 3m 24s) Loss: 0.0000(0.0081) 
EVAL: [6820/8358] Data 0.001 (0.001) Elapsed 14m 55s (remain 3m 21s) Loss: 0.0000(0.0081) 
EVAL: [6840/8358] Data 0.001 (0.001) Elapsed 14m 57s (remain 3m 19s) Loss: 0.0000(0.0081) 
EVAL: [6860/8358] Data 0.002 (0.001) Elapsed 15m 0s (remain 3m 16s) Loss: 0.0004(0.0081) 
EVAL: [6880/8358] Data 0.001 (0.001) Elapsed 15m 3s (remain 3m 13s) Loss: 0.1847(0.0081) 
EVAL: [6900/8358] Data 0.001 (0.001) Elapsed 15m 5s (remain 3m 11s) Loss: 0.0001(0.0081) 
EVAL: [6920/8358] Data 0.001 (0.001) Elapsed 15m 8s (remain 3m 8s) Loss: 0.0004(0.0082) 
EVAL: [6940/8358] Data 0.001 (0.001) Elapsed 15m 10s (remain 3m 5s) Loss: 0.3826(0.0083) 
EVAL: [6960/8358] Data 0.001 (0.001) Elapsed 15m 13s (remain 3m 3s) Loss: 0.0000(0.0083) 
EVAL: [6980/8358] Data 0.001 (0.001) Elapsed 15m 16s (remain 3m 0s) Loss: 0.0030(0.0084) 
EVAL: [7000/8358] Data 0.001 (0.001) Elapsed 15m 18s (remain 2m 58s) Loss: 0.0000(0.0084) 
EVAL: [7020/8358] Data 0.004 (0.001) Elapsed 15m 21s (remain 2m 55s) Loss: 0.0264(0.0084) 
EVAL: [7040/8358] Data 0.001 (0.001) Elapsed 15m 24s (remain 2m 52s) Loss: 0.0002(0.0084) 
EVAL: [7060/8358] Data 0.004 (0.001) Elapsed 15m 26s (remain 2m 50s) Loss: 0.0003(0.0084) 
EVAL: [7080/8358] Data 0.001 (0.001) Elapsed 15m 29s (remain 2m 47s) Loss: 0.0000(0.0085) 
EVAL: [7100/8358] Data 0.003 (0.001) Elapsed 15m 32s (remain 2m 44s) Loss: 0.0000(0.0086) 
EVAL: [7120/8358] Data 0.001 (0.001) Elapsed 15m 34s (remain 2m 42s) Loss: 0.0006(0.0086) 
EVAL: [7140/8358] Data 0.001 (0.001) Elapsed 15m 37s (remain 2m 39s) Loss: 0.0015(0.0086) 
EVAL: [7160/8358] Data 0.001 (0.001) Elapsed 15m 39s (remain 2m 37s) Loss: 0.0000(0.0085) 
EVAL: [7180/8358] Data 0.001 (0.001) Elapsed 15m 42s (remain 2m 34s) Loss: 0.0000(0.0085) 
EVAL: [7200/8358] Data 0.001 (0.001) Elapsed 15m 45s (remain 2m 31s) Loss: 0.0000(0.0086) 
EVAL: [7220/8358] Data 0.002 (0.001) Elapsed 15m 47s (remain 2m 29s) Loss: 0.0036(0.0086) 
EVAL: [7240/8358] Data 0.001 (0.001) Elapsed 15m 50s (remain 2m 26s) Loss: 0.0000(0.0087) 
EVAL: [7260/8358] Data 0.001 (0.001) Elapsed 15m 53s (remain 2m 23s) Loss: 0.0000(0.0087) 
EVAL: [7280/8358] Data 0.001 (0.001) Elapsed 15m 55s (remain 2m 21s) Loss: 0.0002(0.0088) 
EVAL: [7300/8358] Data 0.001 (0.001) Elapsed 15m 58s (remain 2m 18s) Loss: 0.0115(0.0088) 
EVAL: [7320/8358] Data 0.001 (0.001) Elapsed 16m 0s (remain 2m 16s) Loss: 0.0006(0.0088) 
EVAL: [7340/8358] Data 0.001 (0.001) Elapsed 16m 3s (remain 2m 13s) Loss: 0.0000(0.0088) 
EVAL: [7360/8358] Data 0.001 (0.001) Elapsed 16m 6s (remain 2m 10s) Loss: 0.0000(0.0088) 
EVAL: [7380/8358] Data 0.001 (0.001) Elapsed 16m 8s (remain 2m 8s) Loss: 0.0037(0.0088) 
EVAL: [7400/8358] Data 0.001 (0.001) Elapsed 16m 11s (remain 2m 5s) Loss: 0.0004(0.0088) 
EVAL: [7420/8358] Data 0.001 (0.001) Elapsed 16m 14s (remain 2m 2s) Loss: 0.0000(0.0089) 
EVAL: [7440/8358] Data 0.001 (0.001) Elapsed 16m 16s (remain 2m 0s) Loss: 0.0000(0.0089) 
EVAL: [7460/8358] Data 0.001 (0.001) Elapsed 16m 19s (remain 1m 57s) Loss: 0.0001(0.0088) 
EVAL: [7480/8358] Data 0.002 (0.001) Elapsed 16m 21s (remain 1m 55s) Loss: 0.0048(0.0088) 
EVAL: [7500/8358] Data 0.001 (0.001) Elapsed 16m 24s (remain 1m 52s) Loss: 0.0000(0.0088) 
EVAL: [7520/8358] Data 0.001 (0.001) Elapsed 16m 27s (remain 1m 49s) Loss: 0.0368(0.0089) 
EVAL: [7540/8358] Data 0.001 (0.001) Elapsed 16m 29s (remain 1m 47s) Loss: 0.0000(0.0089) 
EVAL: [7560/8358] Data 0.001 (0.001) Elapsed 16m 32s (remain 1m 44s) Loss: 0.0001(0.0088) 
EVAL: [7580/8358] Data 0.001 (0.001) Elapsed 16m 35s (remain 1m 41s) Loss: 0.0000(0.0088) 
EVAL: [7600/8358] Data 0.001 (0.001) Elapsed 16m 37s (remain 1m 39s) Loss: 0.0000(0.0089) 
EVAL: [7620/8358] Data 0.001 (0.001) Elapsed 16m 40s (remain 1m 36s) Loss: 0.0000(0.0089) 
EVAL: [7640/8358] Data 0.002 (0.001) Elapsed 16m 43s (remain 1m 34s) Loss: 0.0000(0.0090) 
EVAL: [7660/8358] Data 0.001 (0.001) Elapsed 16m 45s (remain 1m 31s) Loss: 0.3677(0.0090) 
EVAL: [7680/8358] Data 0.001 (0.001) Elapsed 16m 48s (remain 1m 28s) Loss: 0.0005(0.0090) 
EVAL: [7700/8358] Data 0.001 (0.001) Elapsed 16m 50s (remain 1m 26s) Loss: 0.0034(0.0090) 
EVAL: [7720/8358] Data 0.001 (0.001) Elapsed 16m 53s (remain 1m 23s) Loss: 0.0000(0.0091) 
EVAL: [7740/8358] Data 0.002 (0.001) Elapsed 16m 56s (remain 1m 20s) Loss: 0.0000(0.0092) 
EVAL: [7760/8358] Data 0.002 (0.001) Elapsed 16m 58s (remain 1m 18s) Loss: 0.0035(0.0093) 
EVAL: [7780/8358] Data 0.002 (0.001) Elapsed 17m 1s (remain 1m 15s) Loss: 0.0000(0.0094) 
EVAL: [7800/8358] Data 0.001 (0.001) Elapsed 17m 4s (remain 1m 13s) Loss: 0.0000(0.0095) 
EVAL: [7820/8358] Data 0.001 (0.001) Elapsed 17m 6s (remain 1m 10s) Loss: 0.0000(0.0095) 
EVAL: [7840/8358] Data 0.001 (0.001) Elapsed 17m 9s (remain 1m 7s) Loss: 0.0001(0.0095) 
EVAL: [7860/8358] Data 0.001 (0.001) Elapsed 17m 12s (remain 1m 5s) Loss: 0.4620(0.0095) 
EVAL: [7880/8358] Data 0.001 (0.001) Elapsed 17m 14s (remain 1m 2s) Loss: 0.0001(0.0095) 
EVAL: [7900/8358] Data 0.001 (0.001) Elapsed 17m 17s (remain 0m 59s) Loss: 0.0002(0.0095) 
EVAL: [7920/8358] Data 0.001 (0.001) Elapsed 17m 19s (remain 0m 57s) Loss: 0.0000(0.0095) 
EVAL: [7940/8358] Data 0.001 (0.001) Elapsed 17m 22s (remain 0m 54s) Loss: 0.0000(0.0095) 
EVAL: [7960/8358] Data 0.001 (0.001) Elapsed 17m 25s (remain 0m 52s) Loss: 0.0009(0.0095) 
EVAL: [7980/8358] Data 0.001 (0.001) Elapsed 17m 27s (remain 0m 49s) Loss: 0.0002(0.0095) 
EVAL: [8000/8358] Data 0.001 (0.001) Elapsed 17m 30s (remain 0m 46s) Loss: 0.0011(0.0096) 
EVAL: [8020/8358] Data 0.001 (0.001) Elapsed 17m 33s (remain 0m 44s) Loss: 0.0000(0.0096) 
EVAL: [8040/8358] Data 0.001 (0.001) Elapsed 17m 35s (remain 0m 41s) Loss: 0.1263(0.0096) 
EVAL: [8060/8358] Data 0.001 (0.001) Elapsed 17m 38s (remain 0m 38s) Loss: 0.0000(0.0096) 
EVAL: [8080/8358] Data 0.001 (0.001) Elapsed 17m 40s (remain 0m 36s) Loss: 0.3314(0.0096) 
EVAL: [8100/8358] Data 0.001 (0.001) Elapsed 17m 43s (remain 0m 33s) Loss: 0.0078(0.0097) 
EVAL: [8120/8358] Data 0.001 (0.001) Elapsed 17m 46s (remain 0m 31s) Loss: 0.0001(0.0098) 
EVAL: [8140/8358] Data 0.001 (0.001) Elapsed 17m 48s (remain 0m 28s) Loss: 0.0000(0.0099) 
EVAL: [8160/8358] Data 0.001 (0.001) Elapsed 17m 51s (remain 0m 25s) Loss: 0.0000(0.0099) 
EVAL: [8180/8358] Data 0.001 (0.001) Elapsed 17m 54s (remain 0m 23s) Loss: 0.0000(0.0099) 
EVAL: [8200/8358] Data 0.001 (0.001) Elapsed 17m 56s (remain 0m 20s) Loss: 0.0000(0.0099) 
EVAL: [8220/8358] Data 0.001 (0.001) Elapsed 17m 59s (remain 0m 17s) Loss: 0.0013(0.0100) 
EVAL: [8240/8358] Data 0.001 (0.001) Elapsed 18m 1s (remain 0m 15s) Loss: 0.0000(0.0100) 
EVAL: [8260/8358] Data 0.002 (0.001) Elapsed 18m 4s (remain 0m 12s) Loss: 0.0000(0.0100) 
EVAL: [8280/8358] Data 0.001 (0.001) Elapsed 18m 7s (remain 0m 10s) Loss: 0.0000(0.0100) 
EVAL: [8300/8358] Data 0.001 (0.001) Elapsed 18m 9s (remain 0m 7s) Loss: 0.0046(0.0101) 
EVAL: [8320/8358] Data 0.001 (0.001) Elapsed 18m 12s (remain 0m 4s) Loss: 0.0018(0.0101) 
EVAL: [8340/8358] Data 0.002 (0.001) Elapsed 18m 15s (remain 0m 2s) Loss: 0.0129(0.0101) 
EVAL: [8357/8358] Data 0.000 (0.001) Elapsed 18m 17s (remain 0m 0s) Loss: 0.0000(0.0102) 
Epoch 1 - avg_train_loss: 0.0565  avg_val_loss: 0.0102  time: 28447s
Epoch 1 - LogLoss: 0.020480039821626713 - AUC: 0.9999018643768134
Epoch 1 - Save Best Score: 0.0205 Model
0.020480039821626713
Epoch: [2][0/51233] Data 1.773 (1.773) Elapsed 0m 2s (remain 1715m 30s) Loss: 0.1368(0.1368) Grad: 1.1222  
Epoch: [2][20/51233] Data 0.306 (0.366) Elapsed 0m 12s (remain 517m 4s) Loss: 0.1217(0.0384) Grad: 1.5557  
Epoch: [2][40/51233] Data 0.299 (0.337) Elapsed 0m 23s (remain 487m 15s) Loss: 0.0098(0.0453) Grad: 0.1396  
Epoch: [2][60/51233] Data 0.303 (0.327) Elapsed 0m 34s (remain 476m 59s) Loss: 0.0173(0.0394) Grad: 0.3130  
Epoch: [2][80/51233] Data 0.317 (0.323) Elapsed 0m 44s (remain 471m 35s) Loss: 0.0411(0.0464) Grad: 0.4037  
Epoch: [2][100/51233] Data 0.306 (0.319) Elapsed 0m 55s (remain 468m 22s) Loss: 0.2145(0.0436) Grad: 2.8110  
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[45], line 2
      1 if __name__ == '__main__':                                                                                              
----> 2     main()

Cell In[44], line 18, in main()
     16 for fold in range(CFG.n_fold):
     17     if fold in CFG.trn_fold:
---> 18         train_loop(folds, fold)
     19         #oof_df = pd.concat([oof_df, _oof_df])
     20         #LOGGER.info(f"========== fold: {fold} result ==========")
     21         #get_result(_oof_df)
     22 # CV result
     23 LOGGER.info(f"========== CV ==========")

Cell In[43], line 76, in train_loop(folds, fold)
     73 start_time = time.time()
     75 # train
---> 76 avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)
     78 # eval
     79 avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)

Cell In[42], line 96, in train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)
     79             print('Epoch: [{0}][{1}/{2}] '
     80                   'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
     81                   'Elapsed {remain:s} '
   (...)
     90                    #lr=scheduler.get_lr()[0],
     91                    ))
     93 #     # Log epoch summary to wandb
     94 #     wandb.log({"Epoch Training Loss": losses.avg, "Epoch": epoch})
---> 96         wandb.log({
     97     "Train Loss": losses.val,
     98     "Step": step,
     99     "Gradient Norm": grad_norm,
    100     "Learning Rate": optimizer.param_groups[0]['lr']  # Add this line to log the learning rate
    101 })
    102     return losses.avg

File /opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_run.py:449, in _run_decorator._noop.<locals>.wrapper(self, *args, **kwargs)
    446         wandb.termwarn(message, repeat=False)
    447         return cls.Dummy()
--> 449 return func(self, *args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_run.py:400, in _run_decorator._noop_on_finish.<locals>.decorator_fn.<locals>.wrapper_fn(self, *args, **kwargs)
    397 @functools.wraps(func)
    398 def wrapper_fn(self: Type["Run"], *args: Any, **kwargs: Any) -> Any:
    399     if not getattr(self, "_is_finished", False):
--> 400         return func(self, *args, **kwargs)
    402     default_message = (
    403         f"Run ({self.id}) is finished. The call to `{func.__name__}` will be ignored. "
    404         f"Please make sure that you are using an active run."
    405     )
    406     resolved_message = message or default_message

File /opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_run.py:390, in _run_decorator._attach.<locals>.wrapper(self, *args, **kwargs)
    388         raise e
    389     cls._is_attaching = ""
--> 390 return func(self, *args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_run.py:1877, in Run.log(self, data, step, commit, sync)
   1870 if self._settings._shared and step is not None:
   1871     wandb.termwarn(
   1872         "In shared mode, the use of `wandb.log` with the step argument is not supported "
   1873         f"and will be ignored. Please refer to {wburls.get('wandb_define_metric')} "
   1874         "on how to customize your x-axis.",
   1875         repeat=False,
   1876     )
-> 1877 self._log(data=data, step=step, commit=commit)

File /opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_run.py:1641, in Run._log(self, data, step, commit)
   1638 if any(not isinstance(key, str) for key in data.keys()):
   1639     raise ValueError("Key values passed to `wandb.log` must be strings.")
-> 1641 self._partial_history_callback(data, step, commit)
   1643 if step is not None:
   1644     if os.getpid() != self._init_pid or self._is_attached:

File /opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_run.py:1513, in Run._partial_history_callback(self, row, step, commit)
   1510 if self._backend and self._backend.interface:
   1511     not_using_tensorboard = len(wandb.patched["tensorboard"]) == 0
-> 1513     self._backend.interface.publish_partial_history(
   1514         row,
   1515         user_step=self._step,
   1516         step=step,
   1517         flush=commit,
   1518         publish_step=not_using_tensorboard,
   1519     )

File /opt/conda/lib/python3.10/site-packages/wandb/sdk/interface/interface.py:612, in InterfaceBase.publish_partial_history(self, data, user_step, step, flush, publish_step, run)
    610     item = partial_history.item.add()
    611     item.key = k
--> 612     item.value_json = json_dumps_safer_history(v)
    614 if publish_step and step is not None:
    615     partial_history.step.num = step

File /opt/conda/lib/python3.10/site-packages/wandb/util.py:842, in json_dumps_safer_history(obj, **kwargs)
    840 def json_dumps_safer_history(obj: Any, **kwargs: Any) -> str:
    841     """Convert obj to json, with some extra encodable types, including histograms."""
--> 842     return dumps(obj, cls=WandBHistoryJSONEncoder, **kwargs)

File /opt/conda/lib/python3.10/json/__init__.py:238, in dumps(obj, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)
    232 if cls is None:
    233     cls = JSONEncoder
    234 return cls(
    235     skipkeys=skipkeys, ensure_ascii=ensure_ascii,
    236     check_circular=check_circular, allow_nan=allow_nan, indent=indent,
    237     separators=separators, default=default, sort_keys=sort_keys,
--> 238     **kw).encode(obj)

File /opt/conda/lib/python3.10/json/encoder.py:199, in JSONEncoder.encode(self, o)
    195         return encode_basestring(o)
    196 # This doesn't pass the iterator directly to ''.join() because the
    197 # exceptions aren't as detailed.  The list call should be roughly
    198 # equivalent to the PySequence_Fast that ''.join() would do.
--> 199 chunks = self.iterencode(o, _one_shot=True)
    200 if not isinstance(chunks, (list, tuple)):
    201     chunks = list(chunks)

File /opt/conda/lib/python3.10/json/encoder.py:257, in JSONEncoder.iterencode(self, o, _one_shot)
    252 else:
    253     _iterencode = _make_iterencode(
    254         markers, self.default, _encoder, self.indent, floatstr,
    255         self.key_separator, self.item_separator, self.sort_keys,
    256         self.skipkeys, _one_shot)
--> 257 return _iterencode(o, 0)

File /opt/conda/lib/python3.10/site-packages/wandb/util.py:803, in WandBHistoryJSONEncoder.default(self, obj)
    802 def default(self, obj: Any) -> Any:
--> 803     obj, converted = json_friendly(obj)
    804     obj, compressed = maybe_compress_history(obj)
    805     if converted:

File /opt/conda/lib/python3.10/site-packages/wandb/util.py:613, in json_friendly(obj)
    611         obj = obj.cpu().detach().numpy()
    612     else:
--> 613         return obj.item(), True
    614 elif is_jax_tensor_typename(typename):
    615     obj = get_jax_tensor(obj)

KeyboardInterrupt: 
In [ ]:
# NOTE(review): `stop` is not defined in any visible cell, so evaluating it
# presumably raises NameError; it looks like a deliberate barrier that halts
# "Run All" before the inference cells below -- confirm before removing.
stop
In [46]:
class TestDataset(Dataset):
    """Unlabeled image dataset used at inference time.

    Each item is the image named in the ``img_name`` column of ``df``, read
    from ``TEST_DIR`` with OpenCV, converted from BGR to RGB and, when a
    transform is supplied, passed through it.

    Args:
        df: DataFrame with an ``img_name`` column holding file names relative
            to ``TEST_DIR``.
        transform: Optional callable invoked as ``transform(image=...)`` that
            returns a mapping with an ``'image'`` entry.
    """

    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform
        self.file_names = df["img_name"].values

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the (optionally transformed) RGB image at position ``idx``.

        Raises:
            FileNotFoundError: If OpenCV cannot read the image file.
        """
        file_name = self.file_names[idx]
        file_path = f'{TEST_DIR}/{file_name}'
        image = cv2.imread(file_path)
        # cv2.imread does not raise on a missing or unreadable file; it returns
        # None, which would otherwise surface as an opaque error in cvtColor.
        if image is None:
            raise FileNotFoundError(f"Could not read image file: {file_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            image = self.transform(image=image)['image']

        return image
In [47]:
def get_transforms(*, data):
    """Build the albumentations pipeline for the requested pipeline name.

    Args:
        data: ``'train'`` for the augmenting pipeline (random transpose,
            horizontal/vertical flips and shift-scale-rotate) or ``'valid'``
            for the deterministic pipeline (resize to ``CFG.size``). Both end
            with channel normalisation and conversion to a tensor.

    Returns:
        The ``Compose`` pipeline matching ``data``.

    Raises:
        ValueError: If ``data`` is not one of the supported names. Previously
            an unknown name silently returned ``None``, which disables
            transforms altogether in the datasets that consume the result.
    """
    if data == 'train':
        return Compose([
            # NOTE(review): resizing is disabled here, so training images keep
            # their native size -- confirm they are already uniform.
            #Resize(CFG.size, CFG.size),
#             RandomResizedCrop(CFG.size, CFG.size),
            Transpose(p=0.5),
            HorizontalFlip(p=0.5),
            VerticalFlip(p=0.5),
            ShiftScaleRotate(p=0.5),
            # Mean/std are the standard ImageNet channel statistics.
            Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
            ToTensorV2(),
        ])

    if data == 'valid':
        return Compose([
            Resize(CFG.size, CFG.size),
            Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
            ToTensorV2(),
        ])

    raise ValueError(
        f"Unknown transform pipeline {data!r}; expected 'train' or 'valid'"
    )
#     elif data == "hflip":
#         return Compose([
#             Resize(CFG.size, CFG.size),
#             HorizontalFlip(p=1.0),
#             Normalize(
#                 mean=[0.485, 0.456, 0.406],
#                 std=[0.229, 0.224, 0.225],
#             ),
#             ToTensorV2(),
#         ])
    
#     elif data == "vflip":
#         return Compose([
#             Resize(CFG.size, CFG.size),
#             VerticalFlip(p=1.0),
#             Normalize(
#                 mean=[0.485, 0.456, 0.406],
#                 std=[0.229, 0.224, 0.225],
#             ),
#             ToTensorV2(),
#         ])
In [48]:
# Build the network without pretrained weights; the checkpoint(s) collected in
# `states` are handed to inference() together with the model (presumably loaded
# into it there -- inference() is defined in another cell).
model = CustomResNext(CFG.model_name, pretrained=False)

# NOTE(review): the loop variable is fold 0, but the primary checkpoint path is
# hard-coded to fold 1 (CFG.trn_fold is [1]); only the fallback path uses
# `fold`. Confirm which fold the fallback dataset is meant to provide.
states = []
for fold in [0]:
    try:
        # map_location=device lets a checkpoint saved from a GPU run be loaded
        # on a CPU-only session (device falls back to 'cpu' in the config cell).
        state = torch.load("/kaggle/working/efficientnet_b4_fold1_best.pth", map_location=device)
    except FileNotFoundError:
        state = torch.load(f"/kaggle/input/efficient-b2/{CFG.model_name}_fold{fold}_best.pth", map_location=device)
    states.append(state)

# One prediction array per test-time-augmentation variant; only the plain
# 'valid' pipeline is currently enabled.
tta_preds = []
for tta in ['valid']:
    test_dataset = TestDataset(test, transform=get_transforms(data=tta))
    # shuffle=False keeps predictions in the row order of `test`.
    test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False,
                             num_workers=CFG.num_workers, pin_memory=True)
    predictions = inference(model, states, test_loader, device)
    tta_preds.append(predictions)
    
100%|██████████| 9211/9211 [26:38<00:00,  5.76it/s]
In [49]:
# Inspect the raw two-column model outputs (one array per TTA variant) before
# they are converted to probabilities.
tta_preds
Out[49]:
[array([[-12.358433 ,  10.60907  ],
        [  5.719613 ,  -5.4534726],
        [  5.9396195,  -6.567122 ],
        ...,
        [  6.322296 ,  -6.785685 ],
        [ -9.897232 ,  10.746263 ],
        [  8.923216 ,  -9.098059 ]], dtype=float32)]
In [50]:
# Turn each variant's two-column outputs into softmax probabilities and keep
# only the probability of class index 1.
tta_preds = [
    torch.from_numpy(variant_logits).softmax(dim=1).numpy()[:, 1]
    for variant_logits in tta_preds
]
In [51]:
# Average the probabilities across TTA variants (axis 0 indexes the variants).
tta_preds = np.asanyarray(tta_preds).mean(axis=0)
In [52]:
# Inspect the averaged class-1 probabilities before they go into the submission.
tta_preds
Out[52]:
array([1.0000000e+00, 1.4047021e-05, 3.7016007e-06, ..., 2.0289690e-06,
       1.0000000e+00, 1.4909391e-08], dtype=float32)
In [53]:
# Load the validation-set listing that serves as the submission template.
# The path is derived from DIR_PATH (config cell) rather than repeated as a
# second hard-coded absolute path; with the current config it resolves to the
# same file as before.
sub = pd.read_csv(f'{DIR_PATH}/valset_label.txt')
In [54]:
# Attach the averaged probabilities as the `target` column.
# NOTE(review): assumes the rows of `sub` are in the same order as the `test`
# frame the predictions were computed from -- confirm.
sub = sub.assign(target=tta_preds)
In [55]:
# Write the submission file into the current working directory, without the index column.
sub.to_csv("b4_nTTA_2epochs.csv", index=False)